-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathparser.py
148 lines (121 loc) · 6.54 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import os
import re
from dataclasses import dataclass, field
from locale import getpreferredencoding
from os import PathLike
from pathlib import Path
from typing import List, Union
from spacy import Errors, Language
from spacy_conll.utils import STANZA_AVAILABLE, UDPIPE_AVAILABLE
if STANZA_AVAILABLE:
from spacy_stanza.tokenizer import StanzaTokenizer
if UDPIPE_AVAILABLE:
from spacy_udpipe import UDPipeTokenizer
# Matches only the digits that follow "# sent_id = " in a CoNLL-U header line.
# The lookbehind keeps the prefix out of the match so the number can be
# replaced in place with a corrected, consecutive sentence index.
SENT_ID_RE = re.compile(r"(?<=# sent_id = )(\d+)")
@dataclass(eq=False, repr=False)
class ConllParser:
    """Constructor for a ConllParser, which is a wrapper around a spaCy-like parser with a ConllFormatter
    component. This class simply provides convenience methods to parse text according to some options.

    Constructor arguments:
    :param nlp: instantiated spaCy-like parser
    :param is_tokenized: whether or not the expected input format is pre-tokenized. This must correspond with how
    'nlp' was initialized! If you initialized the 'nlp' object with 'init_parser', make sure you used 'is_tokenized'
    in the same way
    """

    nlp: Language
    is_tokenized: bool = False
    # Backend detected in __post_init__: "spacy", "stanza", or "udpipe".
    # Annotated as Union[str, None] because it is None until __post_init__ runs.
    parser: Union[str, None] = field(init=False, default=None)

    def __post_init__(self):
        """Verify that a ConllFormatter component is present and detect which
        backend parser (spacy/stanza/udpipe) was provided.

        :raises ValueError: if the 'conll_formatter' component is missing from the pipeline
        """
        if "conll_formatter" not in self.nlp.pipe_names:
            raise ValueError(Errors.E001.format(name="conll_formatter", opts=self.nlp.pipe_names))

        # Figure out what kind of parser was provided (needed during data preparation).
        # NOTE: StanzaTokenizer/UDPipeTokenizer are only imported when the
        # corresponding package is available, so the isinstance checks must be
        # guarded by the availability flags — referencing an un-imported name
        # unconditionally would raise NameError.
        if STANZA_AVAILABLE and isinstance(self.nlp.tokenizer, StanzaTokenizer):
            self.parser = "stanza"
            import torch

            # Fixes some pickling issues
            # See https://github.com/explosion/spacy-stanza/issues/34
            torch.set_num_threads(1)
        elif UDPIPE_AVAILABLE and isinstance(self.nlp.tokenizer, UDPipeTokenizer):
            self.parser = "udpipe"
        else:
            self.parser = "spacy"

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(is_tokenized={self.is_tokenized}, parser={self.parser})"

    def prepare_data(self, lines: List[str]) -> List[str]:
        """Prepares data according to whether or not is_tokenized was given and depending on the parser.
        Each parser requires a different type of input when the data is pre_tokenized.

        :param lines: a list of lines to process
        :return: the lines in the correct format for the parser
        """
        if self.is_tokenized:
            if self.parser == "spacy":
                # spaCy expects each pre-tokenized line as a list of token strings
                lines = [l.split() for l in lines]
            elif self.parser == "udpipe":
                # UDPipe expects each line as a list of sentences, each a list of tokens
                lines = [[l.split()] for l in lines]

        return lines

    def parse_file_as_conll(
        self, input_file: Union[PathLike, Path, str], input_encoding: str = getpreferredencoding(), **kwargs
    ) -> str:
        """Parses a given input file with self.parser and returns its CoNLL output.

        :param input_file: path to the input file to process
        :param input_encoding: encoding of 'input_file'
        :param kwargs: keyword arguments that will be passed to `parse_text_as_conll`
        :return: the CoNLL-formatted string for the whole file
        """
        text = Path(input_file).resolve().read_text(encoding=input_encoding)
        return self.parse_text_as_conll(text, **kwargs)

    def parse_text_as_conll(
        self,
        text: str,
        n_process: int = 1,
        no_force_counting: bool = False,
        ignore_pipe_errors: bool = False,
        no_split_on_newline: bool = False,
    ) -> str:
        """Parses a given text (string) with self.parser and returns its CoNLL output.

        :param text: input text (string) to process
        :param n_process: number of processes to use in nlp.pipe(). -1 will use as many cores as available. Might not
               work for a 'parser' other than 'spacy' depending on your environment
        :param no_force_counting: whether to disable force counting the 'sent_id', starting from 1 and increasing for
               each sentence. Instead, 'sent_id' will depend on how spaCy returns the sentences. Must have
               'self.include_headers' enabled
        :param ignore_pipe_errors: whether to ignore a priori errors concerning 'n_process' By default we try to
               determine whether processing works on your system and stop execution if we think it doesn't. If you
               know what you are doing, you can ignore such pre-emptive errors, though, and run the code as-is, which
               will then throw the default Python errors when applicable
        :param no_split_on_newline: by default, the input text will be split on newlines for faster processing. This
               can be disabled with this option
        :return: the CoNLL-formatted string for the whole text
        :raises OSError: pre-emptively when 'n_process' > 1 is known not to work in this configuration
        """
        if n_process > 1 and not ignore_pipe_errors:
            if not self.nlp.get_pipe("conll_formatter").disable_pandas:
                raise OSError(
                    "Due to pandas serialisation, 'n_process' > 1 is not supported when"
                    " 'disable_pandas' is False in the ConllFormatter. Set 'n_process' to 1 or"
                    " initialise the ConllFormatter with 'disable_pandas=True'"
                )

            # Seems that Windows only supports mp on spaCy. Both for UDPipe and Stanza the issue is
            # pickling of the models
            if os.name == "nt" and self.parser in ["udpipe", "stanza"]:
                raise OSError(
                    "'n_process' > 1 is not supported on all platforms/all parsers. Please try again with"
                    " the default value 'n_process' = 1. You can also try to run the code without this pre-emptive"
                    " error message by using the 'ignore_pipe_errors' option"
                )

        if not no_split_on_newline:
            text = text.splitlines()

        text = self.prepare_data(text)

        conll_idx = 0
        # Collect per-sentence strings and join once at the end: repeated
        # 'output += ...' string concatenation is quadratic in the worst case.
        conll_parts = []
        for doc_idx, doc in enumerate(self.nlp.pipe(text, n_process=n_process)):
            for sent in doc.sents:
                conll_idx += 1
                sent_as_conll = sent._.conll_str
                if self.nlp.get_pipe("conll_formatter").include_headers and not no_force_counting:
                    # nlp.pipe returns different docs, meaning that the generated sentence indices
                    # by ConllFormatter are not consecutive (they reset for each new doc)
                    # We can do a regex replace to fix that, though.
                    # Use the pre-compiled pattern directly; replace only the first match.
                    sent_as_conll = SENT_ID_RE.sub(str(conll_idx), sent_as_conll, count=1)

                # Prepend additional newline for all except the very first string.
                if not (doc_idx == 0 and sent.start == 0):
                    sent_as_conll = "\n" + sent_as_conll

                conll_parts.append(sent_as_conll)

        return "".join(conll_parts)