#!/usr/bin/python2

import codecs
import glob
import json

import asf_new as asf  # dummy asf_new.py. To be replace just by asf

# Glob mask selecting the input ASF files (one file per word)
fmask = 'words_*.asf'
# Base name of each output file; filled with (word, sentence 1, sentence 2)
omask = 'words_{}_{}-{}'

# ASF item conversion: maps an ASF attribute name to the callable used to
# convert its value. Handed to asf.ASF() as ``attrconv``.
iconv = dict.fromkeys(
    (
        'phone',
        'word',
        'prosodeme',
        'concatenate',
        'suspicious',
        'pphrsBoundPos',
        'pwordBoundPos',
        'HTKScore',  # Not used as a number here, so kept as text
    ),
    unicode,
)
# Segment boundary times are the only values used numerically
iconv.update({
    'mlfEndTime': float,
    'mlfBegTime': float,
})


def main() :
    """Build one JSON description per cross-sentence concatenation of a word.

    For every file matching ``fmask`` the ASF data are read, each sentence is
    cut at the unit whose ``concatenate`` attribute is ``'*><*'`` and both
    halves are converted to diphones. Then, for every ordered pair of
    different sentences, the first half from one sentence is joined with the
    second half from the other, wrapped in leading/trailing pause diphones
    and written to ``<omask.format(word, sent1, sent2)>.json``.

    Nothing is returned; the only results are the files written to the
    current directory.
    """
    prefix = 'words_'
    suffix = '.asf'
    # Only these diphone features are passed on in the 'keep' record
    kept   = ('BegTime', 'EndTime', 'Sentence')

    # Load the ASF files
    # - one file holds (various realisations of) the same word
    for fname in glob.glob(fmask) :

        # Read the ASF data
        asfdata = asf.ASF(attrconv = iconv)
        asfdata.read_file(fname)

        # The file name is the phonetic form of the word wrapped in a prefix
        # and a suffix. Strip them only from the very beginning/end, so that
        # a word which itself contains such text is not corrupted.
        word = fname
        if word.startswith(prefix) :
            word = word[len(prefix):]
        if word.endswith(suffix) :
            word = word[:-len(suffix)]
        phntext = '|{}|'.format(word)

        # Individual middle-phone splits for the given word: sentence -> (1st part, 2nd part)
        splits = {}
        for sent, segs in asfdata.iteritems() :
            # Find the split unit
            split, _ = segs.find_exact('concatenate', '*><*')
            # The split unit belongs to both parts. As diphones run from the
            # middle of one phone to the middle of the next one, the 1st part
            # ends and the 2nd part starts in the middle of the split unit.
            splits[sent] = (to_diphs(segs[0:split+1], sent),
                            to_diphs(segs[split:],    sent))

        # Combine the parts coming from different sentences into JSON structures
        for sent1, (head, _) in splits.iteritems() :
            for sent2, (_, tail) in splits.iteritems() :
                # Ignore the matching parts
                if sent1 == sent2 :
                    continue

                # Output file (base name, without extension)
                ofile = omask.format(word, sent1, sent2)

                # Join the parts (a new list; the stored parts stay untouched)
                join  = head + tail
                # Add first and final pause
                join.insert(0, {'diphone' : '${}'.format(join[ 0]['l-phone'])})
                join.append(   {'diphone' : '{}$'.format(join[-1]['r-phone'])})

                # Build the units
                units = []
                for u in join :
                    item = {'name' : u['diphone']}
                    # Add the info for the candidate to be kept, if there is
                    # any (the added pause diphones carry none)
                    keep = {k : v for k, v in u.items() if k in kept}
                    if keep :
                        item['keep'] = (keep, )
                    units.append(item)

                # Build JSON from it
                data  = {'phntext' :  phntext,
                         'phrtype' :  to_phrtype(join),
                         'phrlctx' : '$',
                         'phrrctx' : '$',
                         'units'   :  units,
                         'File'    :  ofile + '.wav',
                        }

                # Store the JSON to disc. Data are wrapped in a sequence, as
                # the JSON must be a sequence. The file is closed explicitly,
                # so the data are flushed and no handle is leaked per output.
                # codecs.open() handles the binary mode flag itself, so the
                # plain 'w' mode is used.
                with codecs.open(ofile + '.json', 'w', 'utf8') as ofp :
                    json.dump((data, ), ofp, sort_keys = True, indent = 4)
#                # Add it to the main json
#                sdata.append(data)

#    # Store the JSON to disc
#    json.dump(sdata, codecs.open(ojson, 'wt', 'utf8'), sort_keys = True, indent = 4)


## Converts a sequence of phones to a sequence of diphones
#
def to_diphs(units, sent) :
    """Return the list of diphones built from each pair of neighbouring units.

    units -- sliceable sequence of phone records providing the 'phone',
             'prosodeme', 'mlfBegTime' and 'mlfEndTime' items
    sent  -- value stored under the 'Sentence' key of every diphone

    Each diphone spans from the middle of its left phone to the middle of its
    right phone and takes the prosodeme of the right phone. Fewer than two
    units give an empty list.
    """
    def middle(phone) :
        # Time in the middle of the phone
        return (phone['mlfBegTime'] + phone['mlfEndTime']) / 2.0

    lefts  = units[:-1]
    rights = units[1:]
    # Copy just the base features required for the JSON
    return [{'Sentence'  : sent,
             'BegTime'   : middle(left),
             'EndTime'   : middle(right),
             'diphone'   : left['phone'] + right['phone'],
             'l-phone'   : left['phone'],
             'r-phone'   : right['phone'],
             'prosodeme' : right['prosodeme'],
            }
            for left, right in zip(lefts, rights)]

## Determines the phrase type from the last non-pause unit
#
def to_phrtype(units) :
    """Return the phrase type given by the prosodeme of the last non-pause unit.

    units -- sequence of dict-like phone or diphone records

    A unit is skipped as a pause when its 'phone' is found in '$%#' or when
    its 'diphone' ends with one of these characters. The prosodeme of the
    first unit (going from the end) which is not skipped is mapped as
    '3.1' -> '3', '1.1' -> '1', '2.2' -> '2' and '0' -> '0'.

    Raises ValueError when that prosodeme is missing or unknown, or when no
    non-pause unit exists.
    """
    pauses   = '$%#'
    phrtypes = (('3.1', '3'),
                ('1.1', '1'),
                ('2.2', '2'),
                ('0',   '0'))

    for unit in reversed(units) :
        # Skip the pauses (either a pause phone or a diphone ending in a pause)
        if unit.get('phone', ' ') in pauses or unit.get('diphone', ' ')[-1] in pauses :
            continue

        # The last non-pause unit decides; convert its prosodeme to the type
        prosodeme = unit.get('prosodeme', '?? NOT SET ??')
        for known, phrtype in phrtypes :
            if prosodeme == known :
                return phrtype
        raise ValueError('Unknown prosodeme {} for unit {}'.format(prosodeme, unit))

    # Not found?
    raise ValueError('Unknown prosodeme for units {}'.format(units))

#
# ---------
#
# Run the conversion only when executed as a script, not when imported
if __name__ == '__main__':
    main()
