1
|
# coding: utf-8
|
2
|
|
3
|
import argparse
|
4
|
import codecs
|
5
|
import asflight
|
6
|
|
7
|
|
8
|
# monosyllabic prepositions + conjunctions a, i
proclitics_ver1 = [
    u"a", u"i", u"bez", u"či", u"dle", u"do", u"k", u"ke", u"kol", u"ku",
    u"na", u"nad", u"o", u"ob", u"od", u"po", u"pod", u"pro", u"před", u"přes", u"při",
    u"s", u"se", u"skrz", u"u", u"v", u"ve", u"vně", u"z", u"za", u"ze", u"zpod",
]

# version #1 + monosyllabic relative pronouns
# (derived from proclitics_ver1 so the two lists cannot drift apart; the
# concatenation creates a new list, so mutating one does not affect the other)
proclitics_ver2 = proclitics_ver1 + [
    u"kdo", u"co", u"čí", u"jenž", u"jež", u"již", u"jíž", u"jichž", u"jimž",
    u"němž", u"níž", u"nichž", u"jímž", u"kom", u"kým", u"čem", u"čím", u"čích",
]

# list of enclitic pronouns
enclitics = [u"se", u"si"]
|
24
|
|
25
|
|
26
|
# unit keys: phone, pwordBoundPos, pphrsBoundPos, prosodeme, word
|
27
|
|
28
|
|
29
|
def set_pwords(unit_list, ver=1):
    """Recompute prosodic-word boundaries of one utterance, merging clitics.

    The units are modified in place in three passes:

    1. forward pass: every word starts a new prosodic word unless the
       preceding word is in the selected proclitic list, in which case it
       continues the prosodic word opened by that proclitic;
    2. backward pass: a word from ``enclitics`` that is the last word of a
       prosodic phrase is merged into the prosodic word before it;
    3. backward pass: the ``prosodeme`` values are re-aligned with the new
       prosodic-word boundaries.

    Args:
        unit_list: reversible sequence of dict-like units; the keys read
            and written here are ``pwordBoundPos``, ``pphrsBoundPos``,
            ``prosodeme`` and ``word``.
        ver: proclitic list to use, 1 for ``proclitics_ver1`` or 2 for
            ``proclitics_ver2``.

    Returns:
        None; ``unit_list`` items are updated in place.

    Raises:
        ValueError: if ``ver`` is neither 1 nor 2.

    NOTE(review): boundary codes are presumably 'F' = first unit, 'L' = last
    unit, '-' = inner unit and 'P' = pause -- confirm against the ASF format.
    """
    if ver == 1:
        proclitics = proclitics_ver1
    elif ver == 2:
        proclitics = proclitics_ver2
    else:
        # Previously an unknown version left `proclitics` unbound and the
        # function failed later (or not at all), depending on the data.
        raise ValueError("unsupported proclitics version: %r (expected 1 or 2)" % (ver,))

    _attach_proclitics(unit_list, proclitics)
    _attach_enclitics(unit_list, enclitics)
    _correct_prosodemes(unit_list)


def _attach_proclitics(unit_list, proclitics):
    """Forward pass: set 'F'/'L'/'-' marks, gluing proclitics to the next word."""
    unit_prev = None
    # True while the most recent word was a proclitic, i.e. the next word
    # must continue the current prosodic word instead of opening a new one.
    append = False

    for unit in unit_list:
        if unit['pwordBoundPos'] == 'P':
            # A 'P' unit closes whatever prosodic word precedes it.
            if unit_prev is not None and unit_prev['pwordBoundPos'] != 'P':
                unit_prev['pwordBoundPos'] = 'L'
            append = False

        elif unit['word'] is not None and unit['word'] != '.':
            # Only units carrying a real word (not None, not '.') are
            # treated as word starts.
            if not append:
                # Open a new prosodic word: close the previous one first.
                if unit_prev is not None and unit_prev['pwordBoundPos'] != 'P':
                    unit_prev['pwordBoundPos'] = 'L'
                unit['pwordBoundPos'] = 'F'
            else:
                # Continue the proclitic's prosodic word; keep an 'F' mark on
                # the previous unit (it may be the proclitic's first unit).
                if (unit_prev is not None
                        and unit_prev['pwordBoundPos'] not in ('P', 'F')):
                    unit_prev['pwordBoundPos'] = '-'
                unit['pwordBoundPos'] = '-'

            append = unit['word'].lower() in proclitics

        unit_prev = unit


def _attach_enclitics(unit_list, enclitic_words):
    """Backward pass: merge a phrase-final enclitic into the preceding word."""
    # True right after an enclitic start was found: the next unit visited
    # (the one preceding it in the utterance) must no longer end a word.
    append = False
    # True from a phrase-final unit until the first word start is reached.
    phr_end = False

    for unit in reversed(unit_list):
        if unit['pphrsBoundPos'] == 'L':
            phr_end = True

        if append:
            unit['pwordBoundPos'] = '-'
            append = False

        if phr_end and unit['word'] is not None and unit['word'] != '.':
            append = unit['word'].lower() in enclitic_words
            if append:
                unit['pwordBoundPos'] = '-'
            # NOTE(review): reset once the last word of the phrase has been
            # examined, whether or not it is an enclitic; this nesting was
            # reconstructed from an unindented copy -- verify.
            phr_end = False


def _correct_prosodemes(unit_list):
    """Backward pass: realign prosodemes with the new prosodic-word marks."""
    # Prosodeme of the phrase-final unit, carried back until the unit marked
    # 'F' is reached; None while nothing is being carried.
    prosodeme = None

    for unit in reversed(unit_list):
        if unit['pphrsBoundPos'] == 'L':
            prosodeme = unit['prosodeme']

        if prosodeme is not None:
            unit['prosodeme'] = prosodeme
            if unit['pwordBoundPos'] == 'F':
                prosodeme = None

        elif unit['prosodeme'] != '0' and unit['prosodeme'] != 'X.X':
            # Outside the carried span only '0' and 'X.X' are kept as is.
            unit['prosodeme'] = '0'
|
102
|
|
103
|
|
104
|
# ----------
|
105
|
|
106
|
|
107
|
def main():
    """Command-line entry point: rewrite prosodic words of an ASF file.

    Parses the command line, loads the input file with ``asflight.AsfLight``,
    runs ``set_pwords`` on every item the loaded object yields when iterated,
    and writes the result to the output file using the same encoding.
    """
    parser = argparse.ArgumentParser(description="Modify prosodic words in ASF file.")

    # Positional arguments are declared through `dest` only, as before.
    parser.add_argument(type=str, metavar="ASF_IN", dest="asfIn", help="input ASF file")
    parser.add_argument(type=str, metavar="ASF_OUT", dest="asfOut", help="output ASF file")
    parser.add_argument("-c", "--code-page", type=str, metavar="CODEPAGE", dest="codePage",
                        help="encoding of all files, default value: %(default)s", default='utf-8')
    # Exposes the `ver` parameter of set_pwords; the default keeps the
    # previous behaviour (version 1) when the option is not given.
    parser.add_argument("-p", "--proclitics-version", type=int, choices=(1, 2), metavar="VER",
                        dest="procliticsVer", default=1,
                        help="proclitic list to use (1 or 2), default value: %(default)s")

    args = parser.parse_args()

    asf = asflight.AsfLight(args.asfIn, args.codePage)

    for utt_name in asf:
        set_pwords(asf[utt_name], ver=args.procliticsVer)

    asf.write(args.asfOut, args.codePage)
|
124
|
|
125
|
|
126
|
# ----------
|
127
|
|
128
|
# script entry point: run main() only when executed directly, not on import
if __name__ == "__main__":
    main()
|