#!/bin/bash

# Default settings; each may be overridden by a command-line option.

# Pattern length bounds (-m / -l).
minlength="1"
maxlength="8"

# Thresholds: absolute count (-t) and the value supplied with -T.
mintokens="2"
pmithreshold="0.5"

# "absolute" unless -T is given, which switches it to "relative".
mode='absolute'

# Extra flag forwarded to colibri-patternmodeller; empty means none.
opt_s=''

# Print the command-line help text to stderr.
# Globals:  minlength, maxlength (read) - shown as the defaults of -m and -l.
# Outputs:  help text on stderr; nothing on stdout.
usage() {
 # Unquoted here-doc delimiter so the current defaults are interpolated.
 cat >&2 <<EOF
Syntax: colibri-cooc -t threshold -m minlength -l maxlength corpusfile [corpusfile2]...
Description: Computes co-occurrence statistics (absolute co-cooccurrence or pointwise mutual information) between patterns in a corpus
Arguments: corpusfiles should be plain text files, tokenised, and one sentence per line
Options:
 -t int     Absolute co-occurrence threshold: Show patterns co-occuring at least this many times
 -T float   Co-occurrence threshold according to normalised mutual pointwise information [-1,1]
 -m int     Minimum pattern length (default: $minlength)
 -l int     Maximum pattern length (default: $maxlength)
 -s         Compute skipgrams as well
EOF
}

# Parse the command-line options into the settings variables, then drop the
# parsed options so that only the corpus files remain in "$@".
# Exit status: 0 after -h, 2 on a usage error (unknown option, missing option
# argument, or no corpus file given).
while getopts ht:T:m:l:s flag
do
    case "$flag" in
    (h) usage; exit 0;;
    (t) mintokens=$OPTARG;;
    # -T selects the relative mode and supplies its threshold.
    (T) mode="relative"; pmithreshold=$OPTARG;;
    (m) minlength=$OPTARG;;
    (l) maxlength=$OPTARG;;
    # Forward the skipgram switch to colibri-patternmodeller.
    (s) opt_s="-s";;
    # getopts has already printed a diagnostic for the offending option.
    (*) usage; exit 2;;
    esac
done
shift $((OPTIND - 1))

# At least one corpus file is required.
if [ $# -eq 0 ]; then
    usage
    exit 2
fi

# Remove the intermediate files (written to the current directory under the
# fixed prefix "tmp") whenever the script exits, including on failure.
trap 'rm -f -- tmp.colibri.cls tmp.colibri.dat' EXIT

echo "Class encoding corpora...">&2
# "$@" is quoted so corpus paths containing whitespace or glob characters are
# passed through unchanged. Stop here on failure: the pattern modeller cannot
# run without the encoded corpus.
if ! colibri-classencode -o tmp -u "$@"; then
    echo "Class encoding failed" >&2
    exit 1
fi

echo "Building pattern model...">&2
# ${opt_s:+...} expands to nothing when no extra flag was requested, so no
# empty argument is passed. The modeller is the last command run, so its exit
# status becomes the exit status of the script.
if [[ "$mode" == "absolute" ]]; then
    colibri-patternmodeller -c tmp.colibri.cls -f tmp.colibri.dat -t "$mintokens" -C "$mintokens" -m "$minlength" -l "$maxlength" ${opt_s:+"$opt_s"}
else
    colibri-patternmodeller -c tmp.colibri.cls -f tmp.colibri.dat -t "$mintokens" -Y "$pmithreshold" -m "$minlength" -l "$maxlength" ${opt_s:+"$opt_s"}
fi

