#!/bin/bash

# Default settings; each can be overridden by a command-line option.
opt_a=""       # set to "-a" by -a: only patterns occurring in all corpora
maxlength=1    # -l: maximum pattern length
minlength=1    # -m: minimum pattern length
mintokens=2    # -t: minimum number of occurrences for a pattern

# Print the synopsis, description and option summary to stderr.
# Globals read: mintokens, minlength, maxlength (shown as the current defaults).
# Outputs:      eight lines on stderr, nothing on stdout.
usage() {
 printf '%s\n' \
  "Syntax: colibri-loglikelihood [-a] -t threshold -m minlength -l maxlength corpus1 corpus2..." \
  "Description: Compares the frequency of patterns between two or more corpus files (plain text) by computing log likelihood, following the methodology of Rayson and Garside (2000), Comparing corpora using frequency profiling. In proceedings of the workshop on Comparing Corpora, held in conjunction with the 38th annual meeting of the Association for Computational Linguistics (ACL 2000). 1-8 October 2000, Hong Kong, pp. 1 - 6: http://www.comp.lancs.ac.uk/~paul/publications/rg_acl2000.pdf" \
  "Arguments: corpus1, corpus2 and further should be plain text files, preferably tokenised" \
  "Options:" \
  " -t int     Minimum amount of occurrences for a pattern to be included in the model (default: $mintokens)" \
  " -m int     Minimum pattern length (default: $minlength)" \
  " -l int     Maximum pattern length (default: $maxlength)" \
  " -a         Consider only patterns that occur in all specified corpora" \
  >&2
}

# Parse command-line options into mintokens, minlength, maxlength and opt_a.
#   -h  print usage and exit successfully
#   -t  minimum number of occurrences   -m  minimum pattern length
#   -l  maximum pattern length          -a  forward "-a" to the comparison step
while getopts ht:m:l:a flag; do
    case "$flag" in
    h) usage; exit 0 ;;
    t) mintokens=$OPTARG ;;
    m) minlength=$OPTARG ;;
    l) maxlength=$OPTARG ;;
    a) opt_a="-a" ;;
    # Unknown option or missing option argument: getopts has already printed
    # a diagnostic, so show the usage text and report failure to the caller.
    *) usage; exit 2 ;;
    esac
done
# Drop the parsed options so that "$@" holds only the corpus files.
shift $((OPTIND - 1))

# At least one corpus file is required; a missing operand is a usage error.
if [ "$#" -eq 0 ]; then
    usage
    exit 2
fi

# -t, -m and -l must be non-negative integers. Reject anything else here,
# before it reaches the numeric comparison below or the colibri tools.
for numeric_option in "$mintokens" "$minlength" "$maxlength"; do
    case "$numeric_option" in
    '' | *[!0-9]*)
        echo "Error: -t, -m and -l expect a non-negative integer, got '$numeric_option'" >&2
        usage
        exit 2
        ;;
    esac
done

# Keep the length range consistent: a minimum above the maximum raises the maximum.
if [ "$minlength" -gt "$maxlength" ]; then
    maxlength=$minlength
fi

# Step 1: class-encode all corpora together with output prefix "tmp".
# NOTE(review): the comparison step below reads tmp.colibri.cls, and each
# pattern model is built from <corpus minus .txt>.colibri.dat; both files are
# presumably written by colibri-classencode -- confirm against its documentation.
echo "Class encoding corpora...">&2
if ! colibri-classencode -o tmp "$@"; then
    echo "Error: colibri-classencode failed" >&2
    exit 1
fi

# Step 2: build one pattern model per corpus. The model paths are collected in
# an array so that file names containing whitespace stay intact.
models=()
for textfile in "$@"; do
    base=${textfile%.txt}
    echo "Building pattern model for ${textfile}...">&2
    if ! colibri-patternmodeller -o "$base.colibri.patternmodel" -f "$base.colibri.dat" -t "$mintokens" -m "$minlength" -l "$maxlength" -u; then
        echo "Error: colibri-patternmodeller failed for ${textfile}" >&2
        exit 1
    fi
    models+=("$base.colibri.patternmodel")
done

# Step 3: compare the models. The command is kept as an array so that each
# argument is passed through exactly once; ${opt_a:+...} omits the flag when unset.
cmd=(colibri-comparemodels -c tmp.colibri.cls -l "$maxlength" -m "$minlength" ${opt_a:+"$opt_a"} "${models[@]}")
echo "Running comparemodels: ${cmd[*]}">&2
# Last command of the script: its exit status becomes the script's exit status.
"${cmd[@]}"
