forked from czcorpus/xmlanntools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxml2vrt
executable file
·423 lines (378 loc) · 19.8 KB
/
xml2vrt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2024 Pavel Vondřička <[email protected]>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
# dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
import xml.sax
import html
import re
import sys
# Name of the artificial root element that the script wraps around the input when
# it is an XML fragment; the content handler skips start/end tags of this name.
wrapper_element_name = 'wrapper_root'
class TaggedXMLContentHandler(xml.sax.ContentHandler):
    """
    SAX parser for extraction of vertical from a tagged XML file

    The vertical is printed to STDOUT while the document is parsed: every token
    element becomes one line (token text followed by tab-separated attributes),
    every other element is printed as a structure tag on a line of its own.
    Start tags are held back in a buffer until some content follows, so that
    empty elements can be discarded or printed in the self-closing form.
    Unless flattening is switched off, nested elements of the same name are
    merged into one structure (see `flat_levels`).
    """

    def __init__(self, config, wrapper_tag=None):
        """
        config : mapping-like configuration section
            must provide `get(key, fallback)` and `getboolean(key, fallback)`
            (e.g. a `configparser.SectionProxy`)
        wrapper_tag : str or None
            name of an artificial root element whose tags are to be ignored
        """
        xml.sax.ContentHandler.__init__(self)
        # current token contents (also indicates we are within the scope of a token)
        self.token = None
        # temporary buffer of open elements: tuples (name, attribute string, is_original)
        self.openTagBuffer = []
        # current whitespace buffer (relevant for glue)
        self.whitespace = ''
        # was last element a token? (relevant for glue)
        self.last_was_token = False
        # token counter
        self.tokencnt = 0
        # tracking the XML hierarchy of elements from the root to the currently open element
        self.levels = []
        # settings from the configuration
        self.config = config
        self.strip_word_xml = not config.getboolean('keep_token_tags', False)
        self.glue_element = config.get('glue', 'g') if not config.getboolean('no_glue', False) else None
        self.attrnames = self._name_list(config, 'attributes')
        self.token_element = config.get('token_element', 'w')
        self.keep_empty = config.getboolean('keep_empty', False)
        self.discard_empty = self._name_list(config, 'discard_empty')
        self.discard_freetext = config.getboolean('discard_freetext')
        self.exclude_elements = self._name_list(config, 'exclude_elements')
        self.include_elements = self._name_list(config, 'include_elements')
        self.wrapper_tag = wrapper_tag
        self.no_flattening = config.getboolean('no_flattening', False)
        # parsed as a boolean: a plain get() would treat the strings "no"/"false" as true
        self.flat_override = config.getboolean('flat_override', False)
        self.flat_separator = config.get('flat_separator', ' ')
        self.flat_levelattr = config.get('flat_level_attribute', 'nested_level')
        # if no included elements configured, just extract everything (i.e. the whole root element) by default
        self.include_scope = not self.include_elements
        # excluded elements may be nested, check correct level at the end
        self.exclude_scope = 0
        # regex for line breaks
        self.lbre = re.compile(r"[\n\r]+")

    @staticmethod
    def _name_list(config, key):
        """Return the configured value of `key` split on commas and/or whitespace."""
        return config.get(key, '').replace(',', ' ').split()

    def startElement(self, name, attrs):
        """Open a token, or register the start tag of any other element."""
        # ignore wrapper tag
        if name == self.wrapper_tag:
            return
        # scope of included/excluded elements starting?
        if name in self.include_elements:
            self.include_scope = True
        if name in self.exclude_elements:
            self.exclude_scope += 1
        # ignore everything outside the scope of included elements
        if self.exclude_scope > 0 or not self.include_scope:
            return
        if name == self.token_element:
            # pending start tags belong before the token
            self.output_tag_buffer()
            # token level elements become tokens
            self.tokencnt += 1
            self.token = {'word': ''}
            for key, value in attrs.items():
                self.token[key] = value
        elif self.no_flattening:
            # no flattening: append to the tag buffer
            contents = " " + " ".join([f'{a}="{html.escape(attrs.getValue(a))}"' for a in attrs.keys()]) if len(attrs) else ''
            self.openTagBuffer.append((name, contents, True))
            if not self.token:
                self.last_was_token = False
        else:
            # flatten nested elements
            first = self.close_flat_element(name)
            # append element to the levels heap (hierarchy)
            self.levels.append((name, attrs))
            self.reopen_flat_elements(name, first=first, original=True)

    def endElement(self, name):
        """
        Print the finished token, or close any other element.

        Raises Exception if the element being closed is not the one opened last
        in the tracked hierarchy (this indicates an internal inconsistency).
        """
        # ignore wrapper tag
        if name == self.wrapper_tag:
            return
        # ignore everything outside the scope of included elements
        if self.exclude_scope > 0 or not self.include_scope:
            # leaving the scope of excluded elements?
            if name in self.exclude_elements:
                self.exclude_scope -= 1
            return
        # excluded elements are ignored
        if name in self.exclude_elements:
            return
        if name == self.token_element:
            self._output_token()
        elif self.no_flattening:
            self._close_unflattened(name)
        else:
            first = self.close_flat_element(name, original=True)
            # remove the element from the levels heap (hierarchy)
            ename, _ = self.levels.pop()
            if name != ename:
                # this should never happen (unless there is a bug)!
                raise Exception(f"Unexpected situation: End of '{name}', but last open element was '{ename}'.")
            self.reopen_flat_elements(name, first=first)
        # Leaving scope of included elements? (checked on every path, also for discarded empty elements)
        if name in self.include_elements:
            self.include_scope = False

    def _output_token(self):
        """Print the current token as one line of the vertical and reset the token state."""
        # flush start tags still pending from inside the token
        self.output_tag_buffer()
        # print glue element?
        if self.glue_element is not None and self.whitespace == '' and self.last_was_token:
            print("<{0}/>".format(self.glue_element))
        columns = [self.token['word']]
        attrcnt = 1
        # append all configured (named) attributes; missing ones leave an empty column
        for attrname in self.attrnames:
            if attrname in self.token:
                columns.append(self.token[attrname])
                attrcnt += 1
            else:
                columns.append('')
        # any more unnamed attributes to append? (autogenerated/numbered)
        nextattr = 'attr_' + str(attrcnt)
        while nextattr in self.token:
            columns.append(self.token[nextattr])
            attrcnt += 1
            nextattr = 'attr_' + str(attrcnt)
        print("\t".join(columns))
        self.token = None
        self.whitespace = ''
        self.last_was_token = True

    def _output_closing_tag(self, name):
        """Emit the closing tag of `name`: into the token text (if tags are kept) or as a separate line."""
        if self.token:
            if not self.strip_word_xml:
                self.token['word'] += '</' + name + '>'
        else:
            print('</' + name + '>')
            self.last_was_token = False

    def _close_unflattened(self, name):
        """Close element `name` when flattening is switched off."""
        if not self.openTagBuffer:
            # the start tag has already been output => plain closing tag
            self._output_closing_tag(name)
            return
        # the start tag is still pending, i.e. the element had no contents
        discardable = (not self.keep_empty and name == self.openTagBuffer[-1][0]
                       and (not self.discard_empty or name in self.discard_empty))
        if discardable:
            # empty element to be ignored
            self.openTagBuffer.pop()
        else:
            # printed in the self-closing form, so no separate closing tag must follow
            self.output_tag_buffer(lastName=name)
            if not self.token:
                self.last_was_token = False

    def characters(self, contents):
        """Collect token text, print free text as a single line, or remember whitespace for glue."""
        # ignore everything outside the scope of included elements
        if self.exclude_scope > 0 or not self.include_scope:
            return
        is_freetext = bool(not self.discard_freetext and contents.strip())
        if self.token or is_freetext:
            # real contents follow: pending start tags must be output first
            self.output_tag_buffer()
        if self.token:
            # add contents to current token
            self.token['word'] += contents
        elif is_freetext:
            # if non-token (free) text contents allowed, output it as one single line "token"
            print(self.lbre.sub(" ", contents.strip()))
        elif self.glue_element is not None:
            # just whitespace: relevant for glue insertion
            self.whitespace += contents

    def output_tag_buffer(self, lastName=None):
        """
        Output all buffered start tags and empty the buffer.

        lastName : str or None
            if given, the last buffered element is empty and is output in the
            self-closing form; its name must equal `lastName`

        Inside a token the tags are added to the token text, but only if tags
        within tokens are to be kept (the same rule as applied to closing tags).
        Raises Exception if `lastName` does not match the last buffered element.
        """
        last_index = len(self.openTagBuffer) - 1
        for i, (tname, tcontents, _) in enumerate(self.openTagBuffer):
            if lastName is not None and i == last_index:
                # the last element (lastName) is empty and shall be closed immediately
                if tname != lastName:
                    # this should never happen (unless there is a bug)!
                    raise Exception(f"Unexpected situation: End of '{lastName}', but last open element was '{tname}'.")
                tag = '<' + tname + tcontents + '/>'
            else:
                tag = '<' + tname + tcontents + '>'
            if self.token:
                # inside a token: append to its string contents ('word') unless tags are stripped
                if not self.strip_word_xml:
                    self.token['word'] += tag
            else:
                # just print out
                print(tag)
        self.openTagBuffer = []

    def close_flat_element(self, name, original=False):
        """
        Close the XML element of the given name and all subelements within its scope in the flattened hierarchy

        name : str
            name of the element to close
        original : bool
            True if called for a real end tag of `name` (as opposed to a
            temporary closing before the element is reopened with merged attributes)

        Returns the index of `name` in the flattened hierarchy (or the length of
        the hierarchy if the element is not open), to be passed to `reopen_flat_elements`.
        """
        flat_levels = self.flat_levels()
        first = next((i for i, x in enumerate(flat_levels) if x[0] == name), len(flat_levels))
        # close all elements from the current (deepest) level up to the level of the given element
        for i in range(len(flat_levels) - 1, first - 1, -1):
            curname, _ = flat_levels[i]
            if not self.openTagBuffer:
                # start tag already output => emit closing element tag
                self._output_closing_tag(curname)
                continue
            # there are elements in the buffer of open elements
            is_original = self.openTagBuffer[-1][2] and original and curname == name
            lastName = self.openTagBuffer[-1][0]
            if not is_original or (not self.keep_empty and curname == lastName
                                   and (not self.discard_empty or curname in self.discard_empty)):
                # reopened copy or empty element to be ignored
                self.openTagBuffer.pop()
            else:
                # print out openTagBuffer
                self.output_tag_buffer(lastName=curname)
        return first

    def reopen_flat_elements(self, name, first=None, original=False):
        """
        Reopen XML element of the given name and all subelements within its scope in the flattened hierarchy

        name : str
            name of the element
        first : int or None
            index in the flattened hierarchy to start from (as returned by
            `close_flat_element`); searched for by `name` if not given
        original : bool
            True if called for a real start tag of `name`
        """
        flat_levels = self.flat_levels()
        if first is None:
            # explicit None test: index 0 is a valid value
            first = next((i for i, x in enumerate(flat_levels) if x[0] == name), len(flat_levels))
        for curname, attrs in flat_levels[first:]:
            # add to the tag buffer
            contents = " " + " ".join([f'{a}="{html.escape(str(v))}"' for a, v in attrs.items()]) if len(attrs) else ''
            is_original = original and name == curname
            self.openTagBuffer.append((curname, contents, is_original))
            if not self.token:
                self.last_was_token = False

    def flat_levels(self):
        """
        Return map of levels flattened
        (i.e. nested elements are compressed to the level of the most shallow one with all their attributes merged)

        The result is a list of tuples (element name, dict of attributes).
        """
        # resulting flattened map of levels
        flat = []
        # position (in `flat`) of the most shallow element of each name
        position = {}
        for name, attrs in self.levels:
            # convert XML attributes to dict
            attrs = dict(attrs.items())
            index = position.get(name)
            if index is None:
                # first level where it appears: remember its position in the
                # flattened list (not in the original hierarchy, which is longer
                # as soon as anything has been merged)
                position[name] = len(flat)
                flat.append((name, attrs))
            else:
                # already seen before: just merge new attributes into the first (level) occurrence
                self.merge_attrs(name, flat[index][1], attrs)
        return flat

    def merge_attrs(self, name, baseattrs, newattrs):
        """
        Merge (add, overwrite or concatenate) attributes of elements of the same type
        (modifies the baseattrs with newattrs in place)

        name : str
            name of the element
        baseattrs : dict of str
            base attributes (key: value) to merge the new attributes with
        newattrs : dict of str
            new attributes from a deeper nested level to append to the baseattrs
        """
        for key, value in newattrs.items():
            # allow configuration to override attribute values instead of concatenating them
            # (parsed as boolean, so that "no"/"false" really switch overriding off)
            override = self.config.getboolean(f'flat_override_{name}_{key}', self.flat_override)
            if key not in baseattrs or override:
                baseattrs[key] = value
            else:
                # not new and not to be overridden: concatenate values using default (or more specific, if specified) separator
                sep = self.config.get(f'flat_separator_{name}_{key}', self.flat_separator)
                baseattrs[key] += sep + value
        # always add counter of nested level for all nested elements
        baseattrs[self.flat_levelattr] = baseattrs.get(self.flat_levelattr, 0) + 1
if __name__ == "__main__":
    """
    Convert tagged XML to VRT
    =========================
    Input: fully tagged input XML file name
    Output: STDOUT.
    Options: '-c <filename>' Read additional configuration. By default, the file 'ann2standoff.ini' is
                searched and loaded both from the directory with the scripts and the current working dir.
             '-p <profile>' Use 'profile' section from the configuration. If not provided, the
                section 'DEFAULT' will be used as fallback or the 'profile' specified in this section.
             '-a <list_of_attributes>' comma separated list of positional attribute names in the vertical
                file (no spaces!). The very first column is always the original token string itself,
                the attribute names are thus applied from the second column on.
                If (additional) attributes are provided by the token elements and they are named in the
                corresponding order as 'attr_N', they will be automatically output as well.
                Config setting: 'attributes' (may also be separated by spaces, commas, linebreaks or
                a combination thereof)
             '-kt' Keep XML tags within tokens. By default they are removed.
                Config setting: 'keep_token_tags'.
             '-ke' Keep empty elements in the vertical output. By default they are removed.
                Config setting: 'keep_empty'.
             '-df' Discard free text contents (non-tokenized text contents). By default, any free text contents
                are extracted as fragments in the form of single line "tokens", if found in the structure.
                Config setting: 'discard_freetext'
             '-g <element_name>' name of glue element to mark tokens not separated by space.
                Default: 'g'.
                Config setting: 'glue'.
             '-ng' Do NOT insert glue elements.
                Config setting: 'no_glue'.
             '-te <element_name>' Specify token element name.
                Default: 'w'.
                Config setting: 'token_element'.
             '-i <element_names>' Comma separated list of (sub)element names to be extracted into the output vertical
                (no spaces!). By default the whole root element of the XML document will be extracted.
                Config setting: 'include_elements' (may also be separated by spaces, commas, linebreaks or
                a combination thereof)
             '-e <element_names>' Comma separated list of element names to be excluded from the output vertical
                (no spaces!).
                Config setting: 'exclude_elements' (may also be separated by spaces, commas, linebreaks or
                a combination thereof)
             '-F' Input file is a XML fragment (needing a wrapping root element).
             '-nf' Do NOT flatten nested XML structures

    Switches that are not given on the command line leave the corresponding
    configuration settings untouched.
    """
    import os
    import argparse
    import configparser
    from pathlib import Path

    parser = argparse.ArgumentParser(description="Convert tagged XML file into vertical format")
    parser.add_argument("infile", help="input XML file name")
    parser.add_argument("-c", "--config", help="additional config file", type=str)
    parser.add_argument("-p", "--profile", help="config profile to use", type=str, default='DEFAULT')
    parser.add_argument("-a", "--attributes", help="attribute names (except first position, separated by comma)", type=str)
    parser.add_argument("-te", "--token-element", help="name of token element", type=str)
    parser.add_argument("-i", "--include-elements", help="(sub)elements to extract (default: document root)", type=str)
    parser.add_argument("-e", "--exclude-elements", help="elements to exclude from the extraction", type=str)
    # switches default to None (not False), so that an omitted switch is recognizable
    # as "not specified" and does not overwrite a value set in the configuration file
    parser.add_argument("-kt", "--keep-token-tags", help="keep tags within tokens", action="store_true", default=None)
    parser.add_argument("-ke", "--keep-empty", help="keep empty elements", action="store_true", default=None)
    parser.add_argument("-df", "--discard-freetext", help="discard free (non-token) text contents", action="store_true", default=None)
    parser.add_argument("-ng", "--no-glue", help="do not insert glue element (indicating missing space between tokens)", action="store_true", default=None)
    parser.add_argument("-g", "--glue", help="name of the glue element (default: 'g')", type=str)
    parser.add_argument("-F", "--fragment", help="input file is a XML fragment (needing a wrapping root element)", action="store_true", default=None)
    parser.add_argument("-nf", "--no-flattening", help="do not flatten nested XML structures", action="store_true", default=None)
    args = parser.parse_args()

    # read configuration from files (files that do not exist are skipped by read())
    scriptpath = os.path.dirname(os.path.realpath(__file__))
    curpath = os.getcwd()
    profiles = configparser.ConfigParser()
    profiles.read([scriptpath+'/ann2standoff.ini', curpath+'/ann2standoff.ini'])
    if args.config:
        read = profiles.read(args.config)
        if read != [args.config]:
            raise Exception("Failed reading configuration file: '{0}'".format(args.config))

    # if no profile was specified, check whether the DEFAULT profile specifies a default profile
    cur_profile = args.profile
    if cur_profile == 'DEFAULT' and profiles[cur_profile].get('profile', None):
        cur_profile = profiles[cur_profile].get('profile')

    # evt. update/override currently selected profile configuration with values from command-line arguments;
    # file names and the profile name are no conversion settings and are kept out of the
    # configuration (a '%' in a path would otherwise be rejected as interpolation syntax)
    not_settings = ('infile', 'config', 'profile')
    overrides = {k: v for k, v in vars(args).items() if v is not None and k not in not_settings}
    profiles.read_dict({cur_profile: overrides})
    config = profiles[cur_profile]

    # input file
    infile = Path(args.infile)
    with infile.open(encoding='utf-8') as source:
        if config.getboolean('fragment', False):
            # a fragment has no single root element: wrap it into an artificial one, ignored by the handler
            wrapped = "<"+wrapper_element_name+">"+source.read()+"</"+wrapper_element_name+">"
            xml.sax.parseString(wrapped, TaggedXMLContentHandler(config, wrapper_tag=wrapper_element_name))
        else:
            xml.sax.parse(source, TaggedXMLContentHandler(config))