1
|
#!/usr/bin/python
|
2
|
# -*- coding: utf-8 -*-
|
3
|
|
4
|
# --------------------------------------------------------------- #
|
5
|
# Library for a simple processing of ASF files. #
|
6
|
# --------------------------------------------------------------- #
|
7
|
# 2011 - 2016 Zdenek Hanzlicek (zhanzlic@ntis.zcu.cz) #
|
8
|
# NTIS, University of West Bohemia #
|
9
|
# --------------------------------------------------------------- #
|
10
|
|
11
|
# SVN $Id: asflight.py 1756 2016-04-15 13:39:57Z zhanzlic $
|
12
|
|
13
|
|
14
|
import codecs
|
15
|
import os
|
16
|
import os.path
|
17
|
|
18
|
|
19
|
class AsfLight:
    """Minimal reader/writer for ASF files kept fully in memory.

    The text format handled by this class consists of four kinds of
    non-empty lines (leading/trailing whitespace is ignored):

    * ``# ...``        -- header / comment line, collected in ``header``
    * ``[ a | b ]``    -- list of attribute names, stored in ``attrib_order``
    * ``"name"``       -- starts a new utterance called ``name``
    * ``| x | y |``    -- one unit of the current utterance; the cells are
      matched positionally to ``attrib_order``

    Any other line is ignored.  The instance behaves like a mapping from
    utterance name to its list of units (each unit is a ``dict`` mapping
    attribute name to value): ``len()``, iteration over names, ``in``,
    item read and item assignment are supported.
    """

    def __init__(self, file_name=None, code_page='utf-8'):
        """Create an empty container and optionally load *file_name*.

        :param file_name: path of an ASF file to read, or ``None`` to start
            with an empty container.
        :param code_page: text encoding used to decode *file_name*.
        """
        self.utts = dict()          # utterance name -> list of unit dicts
        self.attrib_order = list()  # attribute names, in column order
        self.header = list()        # header / comment lines

        if file_name is not None:
            # Forward the encoding; previously it was accepted but dropped,
            # so files were always decoded with the default of read().
            self.read(file_name, code_page)

    ## ==========----------

    def __len__(self):
        """Return the number of utterances."""
        return len(self.utts)

    ## ==========----------

    def __iter__(self):
        """Iterate over the utterance names."""
        return iter(self.utts)

    ## ==========----------

    def __setitem__(self, key, value):
        """Store *value* (a list of unit dicts) as utterance *key*."""
        self.utts[key] = value

    ## ==========----------

    def __getitem__(self, key):
        """Return the list of units of utterance *key* (``KeyError`` if absent)."""
        return self.utts[key]

    ## ==========----------

    def __contains__(self, item):
        """Return ``True`` if an utterance named *item* is present."""
        return item in self.utts

    ## ==========----------

    @staticmethod
    def _to_text(value):
        """Return *value* as a text string (``unicode`` on Python 2, ``str`` on 3)."""
        try:
            text_type = unicode  # noqa: F821 -- exists on Python 2 only
        except NameError:
            text_type = str
        return value if isinstance(value, text_type) else text_type(value)

    ## ==========----------

    def read(self, file_name, code_page='utf-8'):
        """Parse the ASF file *file_name* into this instance.

        Header lines are appended to ``header``, the attribute list replaces
        ``attrib_order`` and utterances are added to (or replace entries of)
        ``utts``; data already present in the instance is otherwise kept.
        Cells beyond the number of declared attributes are ignored.

        :param file_name: path of the file to read.
        :param code_page: text encoding of the file.
        :raises ValueError: if a unit line appears before any utterance line
            or has fewer cells than there are declared attributes.
        """
        # Plain 'r': codecs.open() appends 'b' itself and Python 3 refuses
        # the resulting 'rtb' mode.  'with' closes the file even on errors.
        with codecs.open(file_name, 'r', code_page) as asf_handle:
            asf_content = asf_handle.readlines()

        utt_units = None  # units of the utterance currently being read

        for line_no, raw_line in enumerate(asf_content, 1):
            asf_line = raw_line.strip()

            if not asf_line:  # empty line
                continue

            if asf_line.startswith('#'):  # header / comment
                self.header.append(asf_line)

            elif asf_line.startswith('"') and asf_line.endswith('"'):  # new utterance
                utt_units = list()
                self.utts[asf_line[1:-1]] = utt_units

            elif asf_line.startswith('|') and asf_line.endswith('|'):  # unit
                if utt_units is None:
                    raise ValueError(
                        '%s, line %d: unit found before any utterance name'
                        % (file_name, line_no))

                attrib_vals = [val.strip() for val in asf_line[1:-1].split('|')]
                if len(attrib_vals) < len(self.attrib_order):
                    raise ValueError(
                        '%s, line %d: unit has %d values but %d attributes are declared'
                        % (file_name, line_no, len(attrib_vals), len(self.attrib_order)))

                # zip() stops at the shorter sequence, i.e. surplus cells are dropped.
                utt_units.append(dict(zip(self.attrib_order, attrib_vals)))

            elif asf_line.startswith('[') and asf_line.endswith(']'):  # attribute names
                self.attrib_order = [name.strip() for name in asf_line[1:-1].split('|')]

    ## ==========----------

    def write(self, file_name, code_page='utf-8'):
        """Write the content to *file_name* as an aligned ASF table.

        The header lines come first, then the attribute list, then the
        utterances sorted by name.  Every column is padded with spaces to
        the width of its longest cell (or of the attribute name).  Only
        attributes listed in ``attrib_order`` are written; other keys of a
        unit are ignored.  Non-text values are converted to text.

        :param file_name: path of the file to create or overwrite.
        :param code_page: text encoding of the written file.
        :raises KeyError: if a unit lacks an attribute from ``attrib_order``;
            this is detected before the target file is opened.
        """
        to_text = self._to_text
        attrib_order = self.attrib_order

        # Render all cells before touching the file: a malformed unit then
        # fails here instead of leaving a truncated, half-written file.
        attrib_lens = [len(attrib_name) for attrib_name in attrib_order]
        rendered = []
        for utt_name in sorted(self.utts):
            rows = []
            for unit in self.utts[utt_name]:
                row = [to_text(unit[attrib_name]) for attrib_name in attrib_order]
                # keep the maximum length seen so far for each attribute
                attrib_lens = [max(width, len(cell))
                               for width, cell in zip(attrib_lens, row)]
                rows.append(row)
            rendered.append((utt_name, rows))

        # Plain 'w' for the same reason as in read(); 'with' guarantees close().
        with codecs.open(file_name, 'w', code_page) as asf_handle:
            if self.header:
                asf_handle.write("\n".join(self.header))
                asf_handle.write("\n\n")

            # list of attribute names
            names = [attrib_name.ljust(width)
                     for attrib_name, width in zip(attrib_order, attrib_lens)]
            asf_handle.write("[ " + " | ".join(names) + " ]\n\n")

            for utt_name, rows in rendered:
                asf_handle.write('"' + utt_name + '"\n')

                for row in rows:
                    cells = [cell.ljust(width)
                             for cell, width in zip(row, attrib_lens)]
                    asf_handle.write("| " + " | ".join(cells) + " |\n")

                asf_handle.write("\n")
|
137
|
|