#!/usr/bin/python2
# -*- coding: utf-8 -*-
#
# Walks through an ASF file and extracts all the words from it (and prosodic words - only a very simplified implementation of their
# detection so far!). From the words (and p-words) obtained it creates data files for MATLAB, which are then used in experiments with
# measuring the perceptual similarity of these words.
#


import copy
import re
import os.path
import sys

import mlf
import asf
import pm
import wavext


## Phoneme which is required to occur (roughly) in the middle of a word
phone                = 'U'

## Key holding the index of a word instance (add_word_attribs() stores the 1-based position among the instances of the word)
wordattr_index       = "index"

## Key holding the phonetic representation of a word
wordattr_phones      = "phones"
## Key holding the units of a given word; the units are dicts with the attributes loaded from the MLF
wordattr_units       = "units"
## Key holding the sentence the word comes from
wordattr_sentence    = "sentence"

## Array of pitch-marks belonging to a given word [pm.OnePm, ...] (single channel)
wordattr_pmarks      = "pmarks"
## Array of speech samples belonging to a given word [int, ...]. The signal is not modified in any way (no weighting at the beginning/end)
wordattr_speech      = "speech"
## Sampling frequency (int)
wordattr_sampfreq    = "sf"

## Array of speech samples BEFORE the samples of the word [int, ...]. The signal is not modified in any way (no weighting at the beginning/end)
#  The signal of the word plus the context is built by a plain concatenation: data[wordattr_speech_lctx] + data[wordattr_speech]
wordattr_speech_lctx = "speech_lctx"
## Array of speech samples AFTER the samples of the word [int, ...]. The signal is not modified in any way (no weighting at the beginning/end)
#  The signal of the word plus the context is built by a plain concatenation: data[wordattr_speech] + data[wordattr_speech_rctx]
wordattr_speech_rctx = "speech_rctx"

## Name of the ASF file the list of words is built from - global variable, READ-ONLY (filled just once in main)
asf_fname            = None
## Path to the .pm files - global variable, READ-ONLY (filled just once in main)
pmk_dpath            = None
## Path to the .wav files - global variable, READ-ONLY (filled just once in main)
wav_dpath            = None

## Attribute holding the name of a unit in the MLF (or ASF) file. Set to Mlf.attr_modelName by default, main() re-assigns it
mlfattr_modelName    = mlf.Mlf.attr_modelName
## Attribute holding the type of the prosodeme in the MLF (or ASF) file.
mlfattr_prosodmType  = "prosodemType"



#
#  -------- MAIN --------
#
def main() :
    # Entry point of the script: loads the ASF corpus, collects the words (and prosodic words), keeps only the words
    # passing the filters below and stores them into ASF files.
    # NOTE(review): the bare 'return' in the middle of the function makes everything below it unreachable.

    # Set into global variable ...
    global mlfattr_modelName
    global asf_fname
    global pmk_dpath
    global wav_dpath


    # Print an example of usage ...
    # parse the arguments
    # TODO: finish this!!! (the paths are hard-coded for now)
    asf_fname = "/home/dtihelka/Experiments/CCfeats_1class-Classifier/features/spkr_AJ/corpus.rev563.asf"
    wav_dpath = "/home/dtihelka/mnt_ArticServer/Projects/cz/anderle_jan/data/non-mastered/zkracene-pauzy/speech/"
    pmk_dpath = "/home/dtihelka/mnt_ArticServer/Projects/cz/anderle_jan/data/non-mastered/zkracene-pauzy/pitch-marks/"
    wav_fext  = ".wav"
    pmk_fext  = ".pm"
    out_dpath = "/home/dtihelka/Experiments/CCfeats_1class-Classifier/negative_examples/"


    # -------
    # Load the ASF file
    print "Nacitam data z ASF souboru: %s" % asf_fname
    asf_data  = asf.ASF(asf_fname)
    # Name of the attribute holding the units ...
    mlfattr_modelName = asf_data.get_mlf2asf_attribmap()[mlf.Mlf.attr_modelName]

    # -------
    # Build the hash holding all the words (the prosodic ones as well) and their instances from the MLF
    print "Vytvarim seznam slov ..."
    words     = get_words(asf_data)
    pwords    = get_pwords(asf_data)
    # The ASF is not needed any more
    asf_data  = None
    # Merge the words into a single array
    # NOTE(review): the merge is disabled, so 'pwords' is computed but never used, although the messages printed below
    # still talk about prosodic words being included.
#    words     = merge_words(words, pwords)
#    pwords    = None
    # Print the info
    print "Bylo ziskano   %d slov (vcetne pros. slov)" % len(words.keys())


    # -------
    # Filter the words
    print "Filtruji slova ..."

    #####
    #####
    #words = filter_word_occurs(words, word_phnlen_min = 8, word_occurs_max = 15)
    #print_prosodemestats(words, {"0" : 6})
    #print_prosodemestats(words, {"0" : 5, "1" : 4})
    #print_prosodemestats(words, {"0" : 5, "3" : 4})
    #print_prosodemestats(words, {"1" : 4, "3" : 4})
    #print_prosodemestats(words, {"0" : 5, "1" : 4, "3" : 4})
    #print_prosodemestats(words, __NAHRAD_SEDEM__)
    ###
    #words = filter_word_occurs(words, word_phnlen_min = 8, word_occurs_min = 5)
    #print_prosodemestats(words, {"1" : None, "3" : None})
    #print_prosodemestats(words, {"0" : None, "1" : None, "3" : None})
    #return
    #####
    #####

    # Keep the words at least 8 characters long which occur at least 3 times ...
    words = filter_word_occurs(words, word_phnlen_min = 8, word_occurs_min = 3)
    #words = filter_word_texts( words, ("spoluprAce",   "vminulosTi", "prUmislovIx", "viSetRovAJI", "informacI",  "zAkazJIkU",
    #                                       "spoRitelni",   "potvrzuje",  "projektu",    "!opatReJI",   "zAleZitost", "!ekonomiki",
    #                                       "zamJestnancU", "kancelARe",  "primAtora",   "zAstupce",    "veRejnosTi", "hospodARstvI",
    #                                       "pozornost",    "policistU",  "sJemovni",    "ZelezJiCJI",  "republice",   "nAmJesTI,
    #                                       "nemUZeme"))
    #words = filter_word_texts( words, ("nemUZeme", "novinARUm",  "republice",  "rospoCtu",  "zdUrazJil",
    #                                       "problEmU", "nasvjeTe",   "konkurence", "Clovjeka",  "potravin"))
#    words = filter_word_texts(words, ("spoluprAce",   "prUmislovIx", "potravin",  "zAkazJIkU", "vminulosTi", "nasvjeTe",
#                                      "novinARUm",    "nemUZeme",    "pozornost", "informacI", "konkurence", "Clovjeka",
#                                      "hospodARstvI", "republice",   "rospoCtu",  "problEmU",  "zdUrazJil",  "!UsmJevem"))
    # ... and, out of those, the words with the requested phone around their middle
    words = filter_word_midphon(words, phone)
    print "Bylo ponechano %d slov (vcetne pros. slov)" % len(words.keys())

    # Store into ASF
    store_to_asf(words, out_dpath)

#    k = words.keys()[0]
#    print 'slovo:', k
#    print 'data:',  words[k][0]

    # Filter by the requested phoneme


    # NOTE(review): the processing ends here; the code below is never executed
    return



    # -------
    # Storing the words
    print "Pridavam ke slovum recova data a pitch-marky"

    # Add the other attributes to be stored to the words
    words = add_word_attribs(words, pmk_dpath, pmk_fext, wav_dpath, wav_fext, wav_cntx = 0.8)

    # And store them
    print "Ukladam slova ..."
    store_to_mfile(words, out_dpath)
#    store_to_asf  (words, out_dpath)
    store_to_wav  (words, out_dpath)

    # Done ...
    print "Hotovo."






##
# Extracts all the words and the units belonging to them from the ASF data
#
# @param  asfdata the data of an ASF file (instance of asf.ASF). It must contain (besides any other ones) the columns forming a valid
#         MLF file (times, names of units, words; the probability is not required ...)!
# @return deep (!) copy of all the words found and of their attributes:
#         {word : [{wordattr_units : [{dict from MLF}, ...], wordattr_sentence : string}, ...], word : [...]}
# @throws Exception when the text of a word differs from the text put together from the names of its units
#
def get_words(asfdata) :

    # Wrap the ASF data (no copy is made, the data are not changed here)
    mlfdata = asf.ASF()  # TODO: change to mlf.Mlf
    mlfdata.from_asf(asfdata, deep_copy = False)

    # Are the units phones? If they are not, only a warning is printed and the processing goes on
    unittype = mlfdata.get_comment_attrib("unitType")
    if unittype != "phone" :
        print("\nPOZOR: Neznamy typ jednotek '%s' (ocekavany jsou fony)\n\n" % str(unittype))

    # Collect the instances of all the words, sentence by sentence
    collected = {}
    instances = mlfdata.word_insts(mlfdata.word_occs().keys())
    for sentence, sentwords in instances.items() :
        for inst in sentwords :
            # The names of these attributes are given by mlf.Mlf, but they are not defined as constants there
            text  = inst['trans']
            units = inst['segments']

            # Add the instance under the text of the word (the first instance creates the list)
            collected.setdefault(text, []).append({wordattr_units : units, wordattr_sentence : sentence})

            # Paranoid check: the text built from the names of the units must match the text of the word
            unittext = ''.join([u[mlfdata.attr_modelName] for u in units])
            if text != unittext :
                raise Exception("Neshoda textu slova z MLF '%s' a textu se segmentu jednotek '%s'" % (text, unittext))

    # Return a copy, so that the caller does not share the units with the source data
    return copy.deepcopy(collected)


##
# Extracts all the prosodic words and the units belonging to them from the ASF data. The prosodic words are put together
# according to the 'pwordBoundPos' attribute of the units and stored into the 'pword' attribute, which is then mapped
# onto the word attribute so that get_words() can be reused.
#
# @param  asfdata the data of an ASF file (instance of asf.ASF). It must contain (besides any other ones) the columns forming a valid
#         MLF file (times, names of units, words; the probability is not required ...)!
# @return deep (!) copy of all the prosodic words found and of their attributes:
#         {word : [{wordattr_units : [{dict from MLF}, ...], wordattr_sentence : string}, ...], word : [...]}
#
def get_pwords(asfdata) :

    # Work on a deep copy, the 'pword' attribute is written into the data below
    mlfdata = asf.ASF()  # TODO: change to mlf.Mlf
    mlfdata.from_asf(asfdata, deep_copy = True)

    # Words of the prosodic word being built and the index of its first unit
    pending  = None
    firstidx = None

    for sentence, units in mlfdata.getutts().items() :
        for idx, unit in enumerate(units) :
            boundpos = unit['pwordBoundPos']

            # Pauses are skipped. NOTE(review): this is a substring test, so '' and 'PS' are matched as well as 'P' and 'S'
            if boundpos in 'PS' :
                continue

            if boundpos == 'F' :
                # The first unit of a prosodic word: start a new sequence of words
                pending  = [unit['word'], ]
                firstidx = idx
            elif boundpos == 'L' :
                # The last unit of a prosodic word: join the words collected and store the result to the first unit.
                # NOTE(review): the 'word' attribute of this closing unit itself is not collected - confirm it is intended
                pword   = '|'.join([w for w in pending if w is not None])
                pending = None
                mlfdata.set_attrib(sentence, firstidx, 'pword', pword)
            else :
                # A unit inside the prosodic word (units without the 'word' attribute contribute None, dropped when joining)
                pending.append(unit.get('word', None))

    # Replace the column of words by the prosodic words ... A bit of a hack, but the code for "plain" words can be reused
    mlfdata.add_asf2mlf_attribmap("pword", mlf.Mlf.attr_word)

    # Return the list of prosodic words ..
    return get_words(mlfdata)


##
# Makes a union of two hashes of words. A word contained in one of the hashes only is taken over with all its instances;
# for a word contained in both hashes the instances from 'words2' which are not yet among the instances from 'words1'
# are added.
#
# Two instances are taken as the same one when they come from the same sentence, have the same number of units, and the
# begin time of the first unit and the end time of the last unit are equal (the times are multiplied by 100000 and
# truncated to integers before being compared).
#
# @param  words1 the first hash of words {word : [instance, ...], ...}. It is NOT modified
# @param  words2 the second hash of words (the same structure). It is NOT modified
# @return new hash with deep (!) copies of the union of the words
#
def merge_words(words1, words2) :

    attr_beg = mlf.Mlf.attr_begTime
    attr_end = mlf.Mlf.attr_endTime

    ## Converts a time to the integer used for comparing the times
    def time_key(value) :
        return int(float(value) * 100000)

    ## Tests whether two instances of a word (hashes of attributes) describe the same occurrence
    def same_instance(inst1, inst2) :
        units1 = inst1[wordattr_units]
        units2 = inst2[wordattr_units]
        return (inst1[wordattr_sentence]        == inst2[wordattr_sentence]        and
                len(units1)                     == len(units2)                     and
                time_key(units1[ 0][attr_beg])  == time_key(units2[ 0][attr_beg])  and
                time_key(units1[-1][attr_end])  == time_key(units2[-1][attr_end]))

    ## Tests whether the instance 'word' is contained in the array of instances 'words'
    def contains_word(words, word) :
        return any(same_instance(w, word) for w in words)

    # Really copy the first hash - a plain assignment would alias it and the caller's data would be modified below
    merged = copy.deepcopy(words1)

    # Walk through the second hash and add its words
    for word_text, word_cands in words2.items() :
        if word_text not in merged :
            # An unknown word: take over all its instances
            merged[word_text] = copy.deepcopy(word_cands)
        else :
            # A known word: add only the instances not present yet. The new instances are selected before the list
            # is extended, so that they are compared only with the instances coming from 'words1'
            known = merged[word_text]
            fresh = [w for w in word_cands if not contains_word(known, w)]
            known.extend(copy.deepcopy(fresh))

    # Everything in the hash is a copy already
    return merged



##
# Keeps only the words with the required length and the required number of occurrences
#
# The length of a word is taken as the number of characters of its text (the key of the hash).
# NOTE(review): this equals the number of phones only if each phone is written by a single character - confirm.
#
# @param  words           hash with the words {word : [instance, ...], ...}, this very hash is modified!
# @param  word_phnlen_min the minimum length of a word (int). Not limited when not given
# @param  word_phnlen_max the maximum length of a word (int). Not limited when not given
# @param  word_occurs_min the minimum number of occurrences of a word (int). Not limited when not given
# @param  word_occurs_max the maximum number of occurrences of a word (int). Not limited when not given
# @return the modified hash 'words' (the very same object, not a copy)
#
def filter_word_occurs(words, word_phnlen_min = 0, word_phnlen_max = sys.maxsize, word_occurs_min = 0, word_occurs_max = sys.maxsize) :

    # Walk through a snapshot of the items - the words are deleted from the hash inside of the loop, which must not
    # be done while the hash itself is being iterated
    for word_text, word_cands in list(words.items()) :
        length_ok = word_phnlen_min <= len(word_text)  <= word_phnlen_max
        occurs_ok = word_occurs_min <= len(word_cands) <= word_occurs_max
        if not (length_ok and occurs_ok) :
            del words[word_text]

    # Return the result
    return words

##
# Keeps only the words containing the required phone around their middle, and marks the unit at the position of that
# phone in all the instances of the words kept.
#
# The phone is searched for in the text of the word (the key of the hash), within the characters
# [int(len/2 - 1), int(len/2 + 1)]. The unit with the index of the first match gets the attribute
# 'concatenate' set to '*><*'.
# NOTE(review): indexing the units by a character position assumes one character per unit - confirm.
#
# @param  words hash with the words {word : [instance, ...], ...}, this very hash is modified!
# @param  phone the required phone, searched for roughly in the middle of a word
# @return the modified hash 'words' (the very same object, not a copy)
#
def filter_word_midphon(words, phone) :

    # Walk through a snapshot of the keys - the words are deleted from the hash inside of the loop
    for word_text in list(words) :
        # The interval around the middle phone
        mid = len(word_text) / 2.0
        beg = int(mid - 1.0)
        end = int(mid + 1.0)

        # Is the required phone there? A single search gives both the answer and the position
        indx = word_text.find(phone, beg, end + 1)
        if indx < 0 :
            del words[word_text]
            continue

        # Mark the phone in all the instances of the word ('units' is the value of wordattr_units)
        for word in words[word_text] :
            word['units'][indx]['concatenate'] = '*><*'

    # Return the result
    return words

##
# Keeps only the required words
#
# @param  words hash with the words {word : [instance, ...], ...}, this very hash is modified!
# @param  texts the texts of the words to be kept [string, ...]
# @return the modified hash 'words' (the very same object, not a copy)
#
def filter_word_texts(words, texts) :

    # Read the required texts just once (so a one-shot iterable works for all the words as well)
    wanted = tuple(texts)

    # Walk through a snapshot of the keys - the words are deleted from the hash inside of the loop
    for word_text in list(words) :
        if not any(word_text == t for t in wanted) :
            del words[word_text]

    # Return the result
    return words

##
# Adds the attributes 'wordattr_pmarks', 'wordattr_speech', 'wordattr_sampfreq' and 'wordattr_index' to each instance of each
# word (plus 'wordattr_speech_lctx' and 'wordattr_speech_rctx' when a context is required). The begin and end times of the
# units are re-written by the times of the pitch-marks found for them; the other attributes are left unchanged!
#
# @param  words     hash with the words {word : [instance, ...], ...}, this very hash is modified!
# @param  pmk_dpath directory with the pitch-mark files (the file name is the name of the sentence plus 'pmk_fext')
# @param  pmk_fext  extension of the pitch-mark files
# @param  wav_dpath directory with the speech files (the file name is the name of the sentence plus 'wav_fext')
# @param  wav_fext  extension of the speech files
# @param  wav_cntx  the number of seconds of the speech signal to be stored before and after the word. No context is
#         stored when it is None or not positive
# @return the modified hash 'words' (the very same object, not a copy)
#
def add_word_attribs(words, pmk_dpath, pmk_fext, wav_dpath, wav_fext, wav_cntx = None) :

    attr_beg = mlf.Mlf.attr_begTime
    attr_end = mlf.Mlf.attr_endTime

    # Walk through the words and all their candidates
    for word_text, word_cands in words.items() :
        print("     ---> pridavam atributy pro slovo %s (%d kandidatu)" % (word_text, len(word_cands)))

        for indx, cand in enumerate(word_cands) :
            sentence = cand[wordattr_sentence]
            units    = cand[wordattr_units]

            # Load the pitch-marks and the speech
            pmk_file = pm.Pm(         os.path.join(pmk_dpath, sentence + pmk_fext), pm.Pm.shift_nearest)
            wav_file = wavext.WavRead(os.path.join(wav_dpath, sentence + wav_fext))

            # Align the begin and end times of all the units of the word to pitch-marks. If they are aligned already,
            # nothing actually changes
            for u in units :
                u[attr_beg] = pmk_file.find_pmk(float(u[attr_beg]), skip_T = True).get_time()
                u[attr_end] = pmk_file.find_pmk(float(u[attr_end]), skip_T = True).get_time()

            # The beginning and the end of the word
            beg_time = float(units[ 0][attr_beg])
            end_time = float(units[-1][attr_end])

            # The sampling frequency and the length of the recording in seconds
            sampfreq = wav_file.getframerate()
            wavlengt = wav_file.getnframes() / float(sampfreq)

            # The range of samples of the word; the sample at the end time still belongs to the word
            beg_samp = int(beg_time * sampfreq)
            end_samp = int(end_time * sampfreq) +1

            # Fill the arrays of the speech data and of the pitch-marks for the whole word.
            # The pitch-marks are shifted to start at 0 and the T pitch-marks are skipped. 1 is added - matlab indexes from 1
            wav_data = tuple( wav_file[beg_samp : end_samp])
            pmk_data = tuple([pm.OnePm((p.get_time() - beg_time) * sampfreq +1, p.get_type()) for p in pmk_file.get_pmks(beg_time, end_time) if p.get_type() != p.type_transitional])

            # Store to the attributes of the word
            cand[wordattr_pmarks]   = pmk_data
            cand[wordattr_speech]   = wav_data
            cand[wordattr_sampfreq] = sampfreq
            cand[wordattr_index]    = indx +1   # Index from 1

            # Add the context as well, if it is required
            if wav_cntx and wav_cntx > 0.0 :
                # Shift the times, keeping them within the recording
                ctx_beg = max(beg_time - wav_cntx, 0.0)
                ctx_end = min(end_time + wav_cntx, wavlengt)
                # And fill the data. The right context starts right BEHIND the last sample of the word, otherwise the
                # plain concatenation speech + speech_rctx would contain that sample twice
                cand[wordattr_speech_lctx] = tuple(wav_file[int(ctx_beg * sampfreq) : beg_samp])
                cand[wordattr_speech_rctx] = tuple(wav_file[end_samp : int(ctx_end * sampfreq)])


    # Return the result
    return words


##
# Writes the hash of words into matlab m-files (one file per word, holding all the candidates of the word). The file is
# named '<text of the word>_data.m' and defines a function of the same name returning the data.
#
# The instances must hold the attributes added by add_word_attribs(). The header written into the files refers to
# the module-level variables asf_fname, pmk_dpath and wav_dpath.
#
# @param  words     hash with the words {word : [instance, ...], ...}
# @param  out_dpath directory the m-files are written into
#
def store_to_mfile(words, out_dpath) :

    # All the words
    for word_text, word_cands in words.items() :

        # The name of the matlab function
        funct = word_text + "_data"

        # Create a new m-file named after the function; 'with' closes it even when writing of the data fails
        with open(os.path.join(out_dpath, funct + ".m"), "wt") as mfile :
            # Create its header
            mfile.write("function data = %s()\n" % funct)
            mfile.write("%\n")
            mfile.write("%% Funkce vrati data vsech vyskytu slova:          '%s'\n" % word_text)
            mfile.write("%% Data vyskytu slov jsou vygenerovana ze souboru: '%s'\n" % asf_fname)
            mfile.write("%% Data pitch-marku pochazi z adresare:            '%s'\n" % pmk_dpath)
            mfile.write("%% Recova data pochazi z adresare:                 '%s'\n" % wav_dpath)
            mfile.write("%% Soubor byl vytvoren automaticky skriptem:       '%s'\n" % sys.argv[0])
            mfile.write("%\n")
            mfile.write("% Vracena data jsou cell pole {i,j}, kde i je index slova a j je index dat v poradi:\n\n")
            mfile.write("%    1 - veta, ze ktere slovo pochazi\n")
            mfile.write("%    2 - casy zacatku slova v souboru [sec]\n")
            mfile.write("%    3 - cas konce slova v souboru [sec]\n")
            mfile.write("%    4 - pole recovych vzorku (short) odpovidajici slovu (zarovnane na p-mark, nijak nevazene)\n")
            mfile.write("%    5 - vzorkovaci frekvence (int cislo)\n")
            mfile.write("%    6 - pole pitch-marku jako indexy do recovych vzorku [1, ..., length(vzorky)]\n")
            mfile.write("%    7 - pole typu pitch-marku (retezec znaku)\n")
            mfile.write("%    8 - casy zacatku fonemu daneho slova jako indexy do recovych vzorku [1, ...]\n")
            mfile.write("%    9 - casy koncu fonemu daneho slova jako indexy do recovych vzorku [..., length(vzorky)]\n")
            mfile.write("%\n")
            mfile.write("\n\n")

            # The code itself - open the array of rows which is returned
            mfile.write("data = [\n")

            # ------
            # Process the candidates one by one and store their data into the m-file
            for cand in word_cands :
                units    = cand[wordattr_units]
                pmarks   = cand[wordattr_pmarks]

                # The beginning and the end of the word
                beg_time = float(units[ 0][mlf.Mlf.attr_begTime])
                end_time = float(units[-1][mlf.Mlf.attr_endTime])
                sampfreq =       cand[wordattr_sampfreq]

                # The begins and ends of the phones in the word, converted right to the indexes of speech samples.
                # 1 is added - matlab indexes from 1
                beg_phns = [int((float(u[mlf.Mlf.attr_begTime]) - beg_time) * sampfreq) +1 for u in units]
                end_phns = [int((float(u[mlf.Mlf.attr_endTime]) - beg_time) * sampfreq) +1 for u in units]
                # The name of the item
                name     = "%02d_%s" % (cand[wordattr_index], cand[wordattr_sentence])

                # The items of the row. It is relied on python printing str([]) as "[v1, v2, ...]"
                fields   = ["'%s'  %9.5f  %9.5f" % (name, beg_time, end_time),
                            # The speech samples, copied into a list
                            str(list(cand[wordattr_speech])),
                            # The sampling frequency
                            str(sampfreq),
                            # The positions of the pitch-marks, truncated to integers in the same way as the
                            # boundaries of the phones above (no rounding is done)
                            str([int(p.get_time()) for p in pmarks]),
                            # The types as an array of characters, the double quotes must be replaced by apostrophes
                            str([    p.get_type()  for p in pmarks]).replace('"', "'"),
                            str(beg_phns),
                            str(end_phns)]

                # Store everything into the m-file, each item preceded by two spaces
                mfile.write("        {  " + "  ".join(fields) + "};\n")

            # Close the data in the m-file
            mfile.write("       ];\n\n")


##
# Writes the hash of words into ASF files (one file per word, holding all the candidates of the word). The files are
# named 'words_<text of the word>.asf'.
#
# @param  words     hash with the words {word : [instance, ...], ...}
# @param  out_dpath directory the ASF files are written into
#
def store_to_asf(words, out_dpath) :

    for text, instances in words.items() :
        # The file the word goes to
        target    = os.path.join(out_dpath, 'words_{}.asf'.format(text))

        # Each word gets an ASF container of its own
        container = asf.ASF()

        for inst in instances :
            utt = inst[wordattr_sentence]
            # The sentence is added only once, even if several instances of the word come from it
            if not container.getutts().has_key(utt) :
                container.append_utt(utt)
            # The attributes of the individual units are appended to that sentence
            for unit in inst[wordattr_units] :
                container.add_attribs(unit, utt)

        # Store the ASF into the file
        container.write_asf(target)

##
# Writes the signal of the words, extended by the context (wordattr_speech_lctx + wordattr_speech + wordattr_speech_rctx),
# into wav files (one file per word and candidate). The .lab files with the boundaries of the units are stored as well.
#
# The instances must hold the attributes added by add_word_attribs(). When no context was stored there, only the
# signal of the word itself is written.
#
# @param  words     hash with the words {word : [instance, ...], ...}
# @param  out_dpath directory the .wav and .lab files are written into
#
def store_to_wav(words, out_dpath) :

    # All the words and all the candidates
    for word_text, word_cands in words.items() :
        for cand_data in word_cands :
            # The name of the files (without an extension)
            fname    = word_text + "_%02d_%s_to_listen" % (cand_data[wordattr_index], cand_data[wordattr_sentence])

            # The context attributes are present only when a context was required from add_word_attribs(), an empty
            # context is used otherwise
            lctx     = tuple(cand_data.get(wordattr_speech_lctx, ()))
            rctx     = tuple(cand_data.get(wordattr_speech_rctx, ()))
            units    = cand_data[wordattr_units]

            # Build the signal by the plain concatenation of the context and of the word
            sampdata = lctx + tuple(cand_data[wordattr_speech]) + rctx
            sampfreq = cand_data[wordattr_sampfreq]
            # Store the wav (NOTE(review): the last argument is presumably the number of channels - confirm)
            wavext.WriteWav(os.path.join(out_dpath, fname + ".wav"), sampdata, sampfreq, 1)

            # The data for the .lab file: the times of the boundaries of the units and their labels. The times are
            # shifted so that the word starts right BEHIND the left context
            lctxlen  = len(lctx) / float(sampfreq)
            begtime  = float(units[0][mlf.Mlf.attr_begTime])
            labdata  = ["%f %f %s" % (float(u[mlf.Mlf.attr_begTime]) -begtime +lctxlen, float(u[mlf.Mlf.attr_endTime]) -begtime +lctxlen, u[mlfattr_modelName]) for u in units]

            # Store the .lab file; 'with' closes it even when the writing fails
            with open(os.path.join(out_dpath, fname + ".lab"), 'wt') as labfile :
                labfile.write("\n".join(labdata))


##
# Prints the number of instances of the given word
#
# @param  words     hash with the words generated by this script {word : [instance, ...], ...}
# @param  word_text the text of the word
#
def print_wordstats(words, word_text) :

    print("Informace pro slovo:    %s" % word_text)
    # The instances are looked up only after the first line is printed (an unknown word raises KeyError here)
    realizations = words[word_text]
    print("    pocet realizaci:    %d" % len(realizations))
    print("--")
    # A per-instance listing (type, length and position of the prosodeme, sentence) used to be sketched here; it is
    # not implemented


##
# Prints all the words which occur in the given prosodemes often enough
#
# @param  words      hash with the words generated by this script {word : [instance, ...], ...}
# @param  prosodemes hash {type_of_prosodeme : min_occurrences, ...}. When min_occurrences is None, no minimum is
#         required for that type
# @throws Exception when the units of an instance do not hold exactly one type of prosodeme
#
def print_prosodemestats(words, prosodemes) :

    # Walk through the words
    for word_text, word_insts in words.items() :
        # The instances found for each of the required types; each type has a list of its own
        types = {key : [] for key in prosodemes.keys()}

        # Walk through all the instances of the word and sort them out by the type of prosodeme
        for word_data in word_insts :
            found = set(u[mlfattr_prosodmType] for u in word_data[wordattr_units] if mlfattr_prosodmType in u)
            # Does the word have the type of prosodeme? If so, there must be exactly one type in the whole word!
            if len(found) != 1 :
                raise Exception("Chybi atribut typu pros. slova '%s' ve slove '%s', nebo slovo patri do vice prozodemu %s" % (mlfattr_prosodmType, word_text, str(list(found))))
            # The (only) type of prosodeme of the instance
            curr_type = next(iter(found))

            # Remember the instance, if its type is one of those looked for
            if curr_type in prosodemes :
                types[curr_type].append(word_data)

        # Ignore the word, if there is a type which has not reached the required number of instances. None is
        # handled explicitly as "no minimum"
        if not all(minimum is None or minimum <= len(types[key]) for key, minimum in prosodemes.items()) :
            continue

        # Print the word otherwise
        print("Informace pro slovo:    %s" % word_text)
        print("    pocet realizaci:    %d" % len(word_insts))
        print("--")
        for type_key, insts in types.items() :
            print("    typ prozodemu:      %s" %     type_key)
            print("    pocet realizaci:    %d" % len(insts))
            print("--")



#
# --------- MAIN -------------
#
# Run the processing only when the file is executed as a script (not when it is imported as a module)
if __name__ == "__main__" :
   main()
