-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDOREMIFA2XML.PraatScript
407 lines (299 loc) · 16.1 KB
/
DOREMIFA2XML.PraatScript
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
#------------------------------------------------------------------------------------
# DOREMIFA2XML: XML Generator of the DoReMiFa project
# This Praat script READS INFORMATION FROM
# (i) AN EXCEL SPREADSHEET (CONVERTED TO TEXT FILE): see https://halshs.archives-ouvertes.fr/halshs-01068533/
# AND
# (ii) A PRAAT TEXT GRID
# AND WRITES THE OUTPUT TO AN XML FILE
# 18 November 2014
#-----------------------------------------------------------------------------------
# Written by Mac Dang Khoa, International Research Institute MICA
# DoReMiFa Project 2014
# Instruction - How to use ---------
# 1.Copy the Excel sheet of the word list and paste it into the Wordlist_example.txt file (Notepad++ is recommended).
# Then rename Wordlist_example.txt so that it has the same name as the word-list Excel file.
# 2.Modify parameters.xlsx as needed, then copy and paste its content into parameters.txt
# 3.Put parameters.txt, the .TextGrid file and the word-list .txt file in the same folder as the script file
# 4.Open this script in Praat (version > 5.0). In Praat > Preferences, check that the text writing preference is set to 'UTF-8'.
# 5.Run the script (Ctrl + R)
# 6.The output file "[TextGridName].xml" will be created in the same folder as the script file
# 7.Copy "[TextGridName].xml" to the folder Views\Annotation
# 8.Put the audio file (a .wav file with the same name as the TextGrid file) in Views\audio
# 9.Open and view the .xml file in Firefox (with the QuickTime plug-in)
#------------------------------------------------------------------------------------
# INTERFACE
#-----------------------------------------------------------------------------------
#form Files
#sentence inputDir ./
#sentence outputDir ./
#endform
#------------------------------------------------------------------------------------
# MAIN PROGRAM: folder settings
#------------------------------------------------------------------------------------
# Every run starts with an empty Info window.
clearinfo
# Input and output folders. The form above is commented out, so the two values
# are fixed here; edit them if the files are stored somewhere else.
inputDir$ = "./"
outputDir$ = "./"
# Aliases of the two folders.
wavDirectory$ = inputDir$
curFolder$ = inputDir$
resultDirectory$ = outputDir$
# ----- Read parameters.txt into "Table tableParameters" -----------------------------
# The table is queried through two columns: "Parameter" (name of the setting)
# and "Value" (its content).
Read from file: curFolder$ + "parameters.txt"
Rename: "tableParameters"

# Looks up one setting in "Table tableParameters".
#   .key$ : text to find in the "Parameter" column
# Results:
#   readParameter.row    : row number of the first matching row
#   readParameter.value$ : content of the "Value" column in that row
# Stops the script with an explicit message when the setting is absent
# (Search column then reports row 0, which Get value cannot read).
procedure readParameter: .key$
	selectObject: "Table tableParameters"
	.row = Search column: "Parameter", .key$
	if .row = 0
		exitScript: "Parameter """, .key$, """ was not found in the Parameter column of ", curFolder$, "parameters.txt."
	endif
	.value$ = Get value: .row, "Value"
endproc

# ----- Word list: title (also the name of the word-list text file) and language -----
@readParameter: "WordlistTitle"
strWordlistTitle$ = readParameter.value$
@readParameter: "WordlistLanguage"
strWordlistLanguage$ = readParameter.value$
# ----- Name shared by the TextGrid, the audio file and the XML output ---------------
@readParameter: "TextGridName"
strTextGridName$ = readParameter.value$
# ----- Title and note written to the top of the XML, each with its language ---------
@readParameter: "Title"
strTitle$ = readParameter.value$
@readParameter: "TitleLanguage"
strTitleLanguage$ = readParameter.value$
@readParameter: "Note"
strNote$ = readParameter.value$
@readParameter: "NoteLanguage"
strNoteLanguage$ = readParameter.value$
# ----- Number of speakers (a speaker note is written per word when it exceeds 1) ----
@readParameter: "NumberOfSpeaker"
intNumberOfSpeaker = number (readParameter.value$)
printline 'intNumberOfSpeaker'
# ----- Language of the annotator comments -------------------------------------------
@readParameter: "CommentLanguage"
strCommentLanguage$ = readParameter.value$
# ----- Transcriptions: zero, one or many ---------------------------------------------
@readParameter: "NumberOfTranscription"
intNumberOfTranscription = number (readParameter.value$)
# For transcription i the settings "TranscriptionTitle<i>" and "TranscriptionType<i>"
# are stored in strTranscriptionTitle$[i] and strTranscriptionType$[i].
if intNumberOfTranscription > 0
	for i from 1 to intNumberOfTranscription
		printline i ====== 'i'
		# ---- Title of the transcription ----
		strTranscriptionTitle$ = "TranscriptionTitle" + string$ (i)
		printline 'strTranscriptionTitle$'
		@readParameter: strTranscriptionTitle$
		strTranscriptionTitle$[i] = readParameter.value$
		strText$ = strTranscriptionTitle$[i]
		printline 'strText$'
		# ---- Kind of notation ----
		strTranscriptionType$ = "TranscriptionType" + string$ (i)
		printline 'strTranscriptionType$'
		@readParameter: strTranscriptionType$
		intLine = readParameter.row
		printline 'intLine'
		strTranscriptionType$[i] = readParameter.value$
		strText$ = strTranscriptionType$[i]
		printline 'strText$'
	endfor
endif
# -------- Output file: <resultDirectory$><TextGridName>.xml -------------------------
# outFileName$ is the name used by every write to the XML file, so it is built
# from resultDirectory$: that way the output folder setting is honoured.
# NOTE: the name is passed unquoted to fileappend further down, so the folder
# and the TextGrid name must not contain spaces.
outFileName$ = resultDirectory$ + strTextGridName$ + ".xml"
# Kept under its former name as well; same value as outFileName$.
outFilePath$ = outFileName$
printline 'curFolder$'
printline 'outFilePath$'
# ------ Start from an empty file -------
# The XML is produced by appending, so a file left by an earlier run is deleted first.
if fileReadable (outFileName$)
	printline delete 'outFileName$'
	deleteFile (outFileName$)
endif
# ------- Load the word-list text file; it becomes "Table tableAllData" --------------
wordlistPath$ = curFolder$ + strWordlistTitle$ + ".txt"
Read from file: wordlistPath$
Rename: "tableAllData"
# ------- Load the TextGrid; it becomes "TextGrid textGridAllData" -------------------
textGridPath$ = curFolder$ + strTextGridName$ + ".TextGrid"
Read from file: textGridPath$
Rename: "textGridAllData"
# ------- XML prologue, root element, HEADER and top-level NOTE ----------------------
# The text is assembled in xmlHeader$ and appended to the output file in one go.
# Each line ends with a space followed by a line break; one indentation step is
# a tab followed by a space.
hdrStep$ = tab$ + " "
hdrEnd$ = " " + newline$
xmlHeader$ = "<?xml version=""1.0"" encoding=""utf-8""?>" + hdrEnd$
xmlHeader$ = xmlHeader$ + "<?xml-stylesheet type=""text/xsl"" href=""view_text.xsl""?>" + hdrEnd$
# Root element: id is the TextGrid name, xml:lang the word-list language.
xmlHeader$ = xmlHeader$ + "<WORDLIST id=""" + strTextGridName$ + """ xml:lang=""" + strWordlistLanguage$ + """>" + hdrEnd$
# HEADER: title and a link to the audio file named after the TextGrid.
xmlHeader$ = xmlHeader$ + hdrStep$ + "<HEADER>" + hdrEnd$
xmlHeader$ = xmlHeader$ + hdrStep$ + hdrStep$ + "<TITLE xml:lang=""" + strTitleLanguage$ + """>" + strTitle$ + "</TITLE>" + hdrEnd$
xmlHeader$ = xmlHeader$ + hdrStep$ + hdrStep$ + "<SOUNDFILE href=""../audio/" + strTextGridName$ + ".wav""/>" + hdrEnd$
xmlHeader$ = xmlHeader$ + hdrStep$ + "</HEADER>" + hdrEnd$
# General note of the word list.
xmlHeader$ = xmlHeader$ + hdrStep$ + "<NOTE xml:lang=""" + strNoteLanguage$ + """ message=""" + strNote$ + """/>" + hdrEnd$
appendFile: outFileName$, xmlHeader$
# ------ Read the TextGrid: one <W> element per labelled interval of tier 1 ----------
# Label layout, as decoded below:
#   - the UID comes first and ends at the first "_" (the whole label if there is no "_");
#   - the character right after "_" is used as the speaker number;
#   - a final "O", "H" or "M" marks a dysfluency.
# The comment of a word is the label of the tier-2 interval found at the midpoint of the word.
selectObject: "TextGrid textGridAllData"
numIntervals = Get number of intervals: 1
# wid numbers the <W> elements in the order in which they are written.
wid = 0
# NOTE(review): curInterval runs from 2 to numIntervals, so interval 1 is never
# examined -- presumably it is leading silence; confirm.
for id from 1 to (numIntervals-1)
	curInterval = id + 1
	selectObject: "TextGrid textGridAllData"
	intervalText$ = Get label of interval: 1, curInterval
	# A label made only of spaces, tabs and line breaks counts as empty.
	labelContent$ = replace$ (intervalText$, " ", "", 0)
	labelContent$ = replace$ (labelContent$, tab$, "", 0)
	labelContent$ = replace$ (labelContent$, newline$, "", 0)
	if labelContent$ <> ""
		wid = wid+1
		printline
		printline WID: ===== WID: 'wid' =========
		# ---- UID: the part of the label before the first "_" ----
		separatedIndex = index (intervalText$, "_")
		if separatedIndex <> 0
			uid$ = left$ (intervalText$, separatedIndex-1)
		else
			uid$ = intervalText$
		endif
		intUID_textgrid = number (uid$)
		# ---- Row of that UID in the word list ----
		selectObject: "Table tableAllData"
		intUID = Search column: "UID", uid$
		# Row 0 means "not found"; stop with a clear message instead of letting
		# the first Get value below fail on a non-existent row.
		if intUID = 0
			exitScript: "UID """, uid$, """ (tier 1, interval ", curInterval, ", label """, intervalText$, """) was not found in the UID column of the word list."
		endif
		printline UID_TextGrid:'intUID_textgrid' RowNumberInDataSheet 'intUID'
		# ---- Speaker: only reported when there are several speakers ----
		strLastCharacter$ = right$ (intervalText$, 1)
		strMessageSpeaker$ = ""
		if intNumberOfSpeaker > 1
			strMessageSpeaker$ = "Speaker "+ mid$ (intervalText$ , separatedIndex + 1, 1)
		endif
		# ---- Dysfluency: flagged by the last character of the label ----
		strMessageDysfluencies$ = ""
		if strLastCharacter$ = "O"
			strMessageDysfluencies$ = "Overlap of speakers"
		elsif strLastCharacter$ = "H"
			strMessageDysfluencies$ = "Hesitation"
		elsif strLastCharacter$ = "M"
			strMessageDysfluencies$ = "Mispronunciation"
		endif
		printline Speaker: 'strMessageSpeaker$'
		printline Dysfluencies: 'strMessageDysfluencies$'
		# ---- Start and end time of the interval ----
		selectObject: "TextGrid textGridAllData"
		startTime = Get start point: 1, curInterval
		endTime = Get end point: 1, curInterval
		# ---- Comment: label of the tier-2 interval at the midpoint of the word ----
		time = (endTime + startTime)/2
		commentIntervalNum = Get interval at time: 2, time
		printline 'time'
		strComment$ = Get label of interval: 2, commentIntervalNum
		printline Comment: 'strComment$'
		# ---- Everything else comes from the word-list table ----
		selectObject: "Table tableAllData"
		# Layer code: read from the third column, whatever its title is.
		strSourceOfWordLabel$ = Get column label: 3
		strLayerCode$ = Get value: intUID, strSourceOfWordLabel$
		# Reset for every word, so that an unknown code gives an empty layer name
		# rather than the name left over from the previous word.
		strLayer$ = ""
		if strLayerCode$ = "I"
			strLayer$ = "EFEO"
		elsif strLayerCode$ = "II"
			strLayer$ = "SOAS + CNRS"
		elsif strLayerCode$ = "III"
			strLayer$ = "Michel Ferlus"
		elsif strLayerCode$ = "IV"
			strLayer$ = "Frédéric Pain"
		elsif strLayerCode$ = "V"
			strLayer$ = "additions for Arem"
		endif
		printline WordOrigninalt: 'strLayerCode$': 'strLayer$'
		# ---- Numbering and semantic category ----
		strFelusNumbering$ = Get value: intUID, "Ferlus's numbering"
		strCategoryEN$ = Get value: intUID, "semantics"
		strCategoryFR$ = Get value: intUID, "sémantique"
		# ---- Translations ----
		strTransVN$ = Get value: intUID, "Vietnamese"
		strTransEN$ = Get value: intUID, "English"
		strTransCN$ = Get value: intUID, "Chinese [简体中文]"
		strTransFR$ = Get value: intUID, "French"
		strTransKM_Written$ = Get value: intUID, "Written Khmer"
		strTransKM_StandardModern$ = Get value: intUID, "Standard (Modern) Khmer"
		strTransKM_Tateyleu$ = Get value: intUID, "Tatey Leu Khmer"
		strTransKM_WrittenVariant$ = Get value: intUID, "Written Khmer: variant"
		strTransKM_StandardModernVariant$ = Get value: intUID, "Standard (Modern) Khmer: variant"
		strTransKM_TateyleuVariant$ = Get value: intUID, "Tatey Leu Khmer: variant"
		# ---- Transcriptions: the column title of each one comes from the parameters ----
		if intNumberOfTranscription > 0
			for j from 1 to intNumberOfTranscription
				strText$ = strTranscriptionTitle$[j]
				strTargetLanguage$[j] = Get value: intUID, strText$
				strTargetLanguageValue$ = strTargetLanguage$[j]
				printline Target Language 'j': 'strTargetLanguageValue$'
			endfor
		endif
		# ==================== Write the <W> element ====================================
		fileappend 'outFileName$' 'tab$' <W id="W'wid'"> 'newline$'
		# ------ One FORM per transcription, tagged with its kind of notation ------
		if intNumberOfTranscription > 0
			for k from 1 to intNumberOfTranscription
				strTargetLanguageValue$ = strTargetLanguage$[k]
				strstrTranscriptionTypeValue$ = strTranscriptionType$[k]
				fileappend 'outFileName$' 'tab$''tab$' <FORM kindOf="'strstrTranscriptionTypeValue$'">'strTargetLanguageValue$'</FORM> 'newline$'
			endfor
		endif
		# ------ Speaker number (only when there is more than one speaker) ------
		if intNumberOfSpeaker > 1
			fileappend 'outFileName$' 'tab$''tab$' <NOTE xml:lang="eng" message="'strMessageSpeaker$'"/> 'newline$'
		endif
		# ------ Dysfluency, if any ------
		if strMessageDysfluencies$ <> ""
			fileappend 'outFileName$' 'tab$''tab$' <NOTE xml:lang="eng" message="'strMessageDysfluencies$'"/> 'newline$'
		endif
		# ------ Other notes ------
		fileappend 'outFileName$' 'tab$''tab$' <NOTE xml:lang="eng" message="UID = 'uid$'"/> 'newline$'
		fileappend 'outFileName$' 'tab$''tab$' <NOTE xml:lang="eng" message="Ferlus's numbering = 'strFelusNumbering$'"/> 'newline$'
		fileappend 'outFileName$' 'tab$''tab$' <NOTE xml:lang="eng" message="Layer in word list: 'strLayerCode$' ('strLayer$')"/> 'newline$'
		fileappend 'outFileName$' 'tab$''tab$' <NOTE xml:lang="eng" message="Category / Semantic field: 'strCategoryEN$'"/> 'newline$'
		fileappend 'outFileName$' 'tab$''tab$' <NOTE xml:lang="fra" message="Catégorie / Champ sémantique: 'strCategoryFR$'"/> 'newline$'
		# The comment is written only when it contains something other than
		# spaces, tabs and line breaks.
		commentContent$ = replace$ (strComment$, " ", "", 0)
		commentContent$ = replace$ (commentContent$, tab$, "", 0)
		commentContent$ = replace$ (commentContent$, newline$, "", 0)
		if commentContent$ <> ""
			fileappend 'outFileName$' 'tab$''tab$' <NOTE xml:lang="'strCommentLanguage$'" message="Annotator's comment: 'strComment$'"/> 'newline$'
		endif
		# ------ Translations: a cell that is empty or holds "-" is left out ------
		if strTransVN$ <> "" and strTransVN$ <> "-"
			fileappend 'outFileName$' 'tab$''tab$' <TRANSL xml:lang="vie">'strTransVN$'</TRANSL> 'newline$'
		endif
		if strTransEN$ <> "" and strTransEN$ <> "-"
			fileappend 'outFileName$' 'tab$''tab$' <TRANSL xml:lang="eng">'strTransEN$'</TRANSL> 'newline$'
		endif
		if strTransCN$ <> "" and strTransCN$ <> "-"
			fileappend 'outFileName$' 'tab$''tab$' <TRANSL xml:lang="cmn">'strTransCN$'</TRANSL> 'newline$'
		endif
		if strTransFR$ <> "" and strTransFR$ <> "-"
			fileappend 'outFileName$' 'tab$''tab$' <TRANSL xml:lang="fra">'strTransFR$'</TRANSL> 'newline$'
		endif
		if strTransKM_Written$ <> "" and strTransKM_Written$ <> "-"
			fileappend 'outFileName$' 'tab$''tab$' <TRANSL xml:lang="khm-written">'strTransKM_Written$'</TRANSL> 'newline$'
		endif
		if strTransKM_StandardModern$ <> "" and strTransKM_StandardModern$ <> "-"
			fileappend 'outFileName$' 'tab$''tab$' <TRANSL xml:lang="khm-standard-modern">'strTransKM_StandardModern$'</TRANSL> 'newline$'
		endif
		if strTransKM_Tateyleu$ <> "" and strTransKM_Tateyleu$ <> "-"
			fileappend 'outFileName$' 'tab$''tab$' <TRANSL xml:lang="khm-tateyleu">'strTransKM_Tateyleu$'</TRANSL> 'newline$'
		endif
		if strTransKM_WrittenVariant$ <> "" and strTransKM_WrittenVariant$ <> "-"
			fileappend 'outFileName$' 'tab$''tab$' <TRANSL xml:lang="khm-written-variant">'strTransKM_WrittenVariant$'</TRANSL> 'newline$'
		endif
		if strTransKM_StandardModernVariant$ <> "" and strTransKM_StandardModernVariant$ <> "-"
			fileappend 'outFileName$' 'tab$''tab$' <TRANSL xml:lang="khm-standard-modern-variant">'strTransKM_StandardModernVariant$'</TRANSL> 'newline$'
		endif
		if strTransKM_TateyleuVariant$ <> "" and strTransKM_TateyleuVariant$ <> "-"
			fileappend 'outFileName$' 'tab$''tab$' <TRANSL xml:lang="khm-tateyleu-variant">'strTransKM_TateyleuVariant$'</TRANSL> 'newline$'
		endif
		# ------ Audio span ------
		# The padded times (500 ms before and after) are computed, but the line
		# that would write them is disabled: the exact interval times are written.
		startTimePlus = startTime - 0.500
		endTimePlus = endTime + 0.500
		#fileappend 'outFileName$' 'tab$''tab$' <AUDIO start="'startTimePlus:3'" end="'endTimePlus:3'"/> 'newline$'
		fileappend 'outFileName$' 'tab$''tab$' <AUDIO start="'startTime:3'" end="'endTime:3'"/> 'newline$'
		fileappend 'outFileName$' 'tab$' </W> 'newline$'
	endif
endfor
# Close the root element: the XML file is complete.
fileappend 'outFileName$' </WORDLIST> 'newline$'
printline Done
# Clean up the object list: remove the three objects created by this script.
# They are selected by name first, so the result does not depend on which
# object happened to be selected when the loop ended.
selectObject: "Table tableParameters"
plusObject: "Table tableAllData"
plusObject: "TextGrid textGridAllData"
Remove