#!/bin/bash

# Default settings; the option parser further down may override each of them.

# Pattern selection (overridden by -t, -m and -l respectively).
mintokens=1
minlength=1
maxlength=8

# Extra flags forwarded verbatim to colibri-patternmodeller.
opt_s=''      # becomes "-s" when skipgrams are requested with -s
opt_u='-u'    # cleared by -i (indexed model)

# "1" keeps every output column; anything else trims the output to two columns.
verbose=0

# Print the command-line help text to stderr.
# Globals read: mintokens, minlength, maxlength (shown as the current defaults).
# Outputs: help text on stderr; nothing on stdout.
usage() {
  # The -t default must come from $mintokens, the variable the option parser
  # actually sets; the heredoc is unquoted so the three defaults are expanded.
  cat >&2 <<EOF
Syntax: colibri-freqlist -t threshold -m minlength -l maxlength corpusfile [corpusfile2]...
Description: Extract n-grams (and optionally skipgrams) with their counts from one or more plain-text corpus files
Arguments: corpusfiles should be plain text files, tokenised, and one sentence per line
Options:
 -t int     Minimum amount of occurrences for a pattern to be included in the model (default: $mintokens). Increase this value to decrease memory usage, especially on big corpora.
 -m int     Minimum pattern length (default: $minlength)
 -l int     Maximum pattern length (default: $maxlength)
 -s         Compute skipgrams as well
 -i         (Advanced) Indexed model: show the exact positions (in the form sentence:token) of each occurrence of a pattern (implies -v, requires more memory!)
 -v         Verbose output
 -h         Show this help text
EOF
}

# Parse command-line options into the settings variables.
#   -t/-m/-l take a value; -i, -s and -v are switches; -h prints help.
# After the loop the parsed options are shifted away, so the positional
# parameters hold only the corpus files.
while getopts ht:m:l:isv flag; do
    case "$flag" in
        h) usage; exit 0 ;;
        t) mintokens=$OPTARG ;;
        m) minlength=$OPTARG ;;
        l) maxlength=$OPTARG ;;
        # Indexed model: force verbose output and drop the -u flag.
        i) verbose=1; opt_u="" ;;
        s) opt_s="-s" ;;
        v) verbose=1 ;;
        # Unknown option or missing option argument (getopts has already
        # printed a diagnostic): show the help and report failure to the caller.
        *) usage; exit 2 ;;
    esac
done
shift $((OPTIND - 1))

# At least one corpus file is required. Without one there is nothing to
# process, so show the help and exit non-zero so that calling scripts can
# detect the error.
if [ $# -eq 0 ]; then
    usage
    exit 2
fi

# Reserve a unique name in the current directory. It serves as the output
# prefix for the class encoder, whose results are read back below as
# <tmp>.colibri.cls and <tmp>.colibri.dat.
tmp=$(mktemp --tmpdir=.) || exit

# mktemp creates the placeholder file itself, so it has to be removed along
# with the two derived files. Doing this from an EXIT trap cleans up on the
# error paths as well; -f keeps it quiet when a file was never created.
trap 'rm -f -- "$tmp" "$tmp.colibri.cls" "$tmp.colibri.dat"' EXIT

echo "Class encoding corpora...">&2
# "$@" is quoted so corpus paths containing whitespace or glob characters
# reach the encoder intact. Without encoded data there is nothing to model,
# so stop here (with the encoder's exit status) if it fails.
colibri-classencode -o "$tmp" -u "$@" || exit

echo "Building pattern model...">&2
# Assemble the modeller arguments once; the optional flags are only added
# when they are non-empty so no empty argument is ever passed.
model_args=(-c "$tmp.colibri.cls" -f "$tmp.colibri.dat" -t "$mintokens" -m "$minlength" -l "$maxlength")
if [[ -n "$opt_u" ]]; then
    model_args+=("$opt_u")
fi
if [[ -n "$opt_s" ]]; then
    model_args+=("$opt_s")
fi
model_args+=(-P)

# Output is sorted by the second tab-separated column (numeric, descending),
# ties broken by the first column.
if [[ "$verbose" == "1" ]]; then
    colibri-patternmodeller "${model_args[@]}" | sort -k2,2rn -k1,1 -t $'\t'
else
    # Keep only the first two columns and drop the final sorted line.
    # NOTE(review): presumably that line is the modeller's header row, which
    # sorts last because its second field is not numeric - confirm.
    colibri-patternmodeller "${model_args[@]}" | cut -f1,2 | sort -k2,2rn -k1,1 -t $'\t' | head -n -1
fi

