Enable more ruff checks and fix #67

Merged 5 commits on Oct 31, 2024
31 changes: 29 additions & 2 deletions poetry.lock

Some generated files are not rendered by default.

19 changes: 19 additions & 0 deletions pyproject.toml
@@ -26,6 +26,7 @@ python-levenshtein = "^0.26.1"
pytest = "^8.3.2"
pytest-cov = "^5.0.0"
pytest-mock = "^3.14.0"
ruff = "^0.7.1"

[tool.poetry.group.docs.dependencies]
mkdocs = "^1.6.0"
@@ -42,3 +43,21 @@ build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
addopts = "-v -p no:warnings --cov=src --cov-report=html --doctest-modules --ignore=run_app.py"

[tool.ruff]
target-version = "py310"

[tool.ruff.lint]
select = [
"E", # pycodestyle
"F", # Pyflakes
"I", # isort
"UP", # pyupgrade
"RUF", # ruff
"W", # pylint
]
ignore = [
"E501", # line too long
"RUF001", # ambiguous multiplication sign
"RUF003", # ambiguous fullwidth colon
]
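
As a rough illustration (not part of the diff), these are the kinds of issues the selected rule families report, assuming the [tool.ruff.lint] configuration above; with it in place, running ruff check . from the project root lists them and ruff check --fix . applies the auto-fixable ones:

import re, sys                    # E401: multiple imports on one line; F401 (Pyflakes): `sys` imported but unused
msg = "found %s rows" % 3         # UP031 (pyupgrade): printf-style formatting, prefer an f-string
m = re.search("\d+", msg)         # W605 (pycodestyle warning): invalid escape sequence in a non-raw string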
14 changes: 7 additions & 7 deletions run_app.py
@@ -58,7 +58,7 @@ def get_file_type(file_path):
if file_path.is_dir():
return "directory"
elif file_path.suffix == ".html":
if re.search("table_\d+.html", file_path.name):
if re.search(r"table_\d+.html", file_path.name):
return "linked_tables"
else:
return "main_text"
@@ -118,17 +118,17 @@ def read_file_structure(file_path, target_dir):
if ftype == "directory":
continue
elif ftype == "main_text":
base_file = re.sub("\.html", "", fpath)
base_file = re.sub(r"\.html", "", fpath)
structure = fill_structure(structure, base_file, "main_text", fpath)
structure = fill_structure(structure, base_file, "out_dir", out_dir)
elif ftype == "linked_tables":
base_file = re.sub("_table_\d+\.html", "", fpath)
base_file = re.sub(r"_table_\d+\.html", "", fpath)
structure = fill_structure(
structure, base_file, "linked_tables", fpath
)
structure = fill_structure(structure, base_file, "out_dir", out_dir)
elif ftype == "table_images":
base_file = re.sub("_table_\d+\..*", "", fpath)
base_file = re.sub(r"_table_\d+\..*", "", fpath)
structure = fill_structure(
structure, base_file, "table_images", fpath
)
@@ -143,11 +143,11 @@ def read_file_structure(file_path, target_dir):
else:
ftype = get_file_type(file_path)
if ftype == "main_text":
base_file = re.sub("\.html", "", file_path).split("/")[-1]
base_file = re.sub(r"\.html", "", file_path).split("/")[-1]
if ftype == "linked_tables":
base_file = re.sub("_table_\d+\.html", "", file_path).split("/")[-1]
base_file = re.sub(r"_table_\d+\.html", "", file_path).split("/")[-1]
if ftype == "table_images":
base_file = re.sub("_table_\d+\..*", "", file_path).split("/")[-1]
base_file = re.sub(r"_table_\d+\..*", "", file_path).split("/")[-1]
template = {
base_file: {
"main_text": "",
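
For context (not part of the diff): the run_app.py changes only add the r prefix to the regex literals, so backslash escapes such as \d reach the re module unchanged instead of being interpreted by the string literal; ruff flags the un-prefixed form as an invalid escape sequence (W605). A minimal sketch:

import re

re.search("table_\d+.html", "table_12.html")    # non-raw string: "\d" is an invalid escape sequence (W605)
re.search(r"table_\d+.html", "table_12.html")   # raw string: the backslash is handed to the regex engine as-is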
28 changes: 9 additions & 19 deletions src/abbreviation.py
@@ -12,7 +12,7 @@ def __yield_lines_from_doc(self, doc_text):
yield line.strip()

def __conditions(self, candidate):
"""
r"""
Based on Schwartz&Hearst

2 <= len(str) <= 10
@@ -52,10 +52,10 @@ def __best_candidates(self, sentence):
if "(" in sentence:
# Check some things first
if sentence.count("(") != sentence.count(")"):
raise ValueError("Unbalanced parentheses: {}".format(sentence))
raise ValueError(f"Unbalanced parentheses: {sentence}")

if sentence.find("(") > sentence.find(")"):
raise ValueError("First parentheses is right: {}".format(sentence))
raise ValueError(f"First parentheses is right: {sentence}")

close_index = -1
while 1:
@@ -139,7 +139,7 @@ def __get_definition(self, candidate, sentence):
start_index = len(first_chars) - 1
while count < candidate_freq:
if abs(start) > len(first_chars):
raise ValueError("candidate {} not found".format(candidate))
raise ValueError(f"candidate {candidate} not found")
start -= 1
# Look up key in the definition
try:
@@ -214,9 +214,7 @@ def __select_definition(self, definition, abbrev):
l_index -= 1
if l_index == -1 * (len(definition) + 1):
raise ValueError(
"definition {} was not found in {}".format(
abbrev, definition
)
f"definition {abbrev} was not found in {definition}"
)

else:
@@ -271,19 +269,15 @@ def __extract_abbreviation_definition_pairs(
definition = self.__get_definition(candidate, clean_sentence)
except (ValueError, IndexError) as e:
self.log.debug(
"{} Omitting candidate {}. Reason: {}".format(
i, candidate, e.args[0]
)
f"{i} Omitting candidate {candidate}. Reason: {e.args[0]}"
)
omit += 1
else:
try:
definition = self.__select_definition(definition, candidate)
except (ValueError, IndexError) as e:
self.log.debug(
"{} Omitting definition {} for candidate {}. Reason: {}".format(
i, definition, candidate, e.args[0]
)
f"{i} Omitting definition {definition} for candidate {candidate}. Reason: {e.args[0]}"
)
omit += 1
else:
@@ -295,12 +289,8 @@ def __extract_abbreviation_definition_pairs(
abbrev_map[candidate] = definition
written += 1
except (ValueError, IndexError) as e:
self.log.debug(
"{} Error processing sentence {}: {}".format(i, sentence, e.args[0])
)
self.log.debug(
"{} abbreviations detected and kept ({} omitted)".format(written, omit)
)
self.log.debug(f"{i} Error processing sentence {sentence}: {e.args[0]}")
self.log.debug(f"{written} abbreviations detected and kept ({omit} omitted)")

# Return most common definition for each term
if collect_definitions:
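
The abbreviation.py edits are mechanical .format() to f-string rewrites (pyupgrade's UP032), plus an r prefix on the __conditions docstring, which keeps any backslash sequences in it literal. A standalone sketch of the rewrite, with an invented value:

candidate = "BMI"
old = "candidate {} not found".format(candidate)   # flagged by UP032
new = f"candidate {candidate} not found"           # preferred form
assert old == new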
6 changes: 3 additions & 3 deletions src/autoCORPus.py
@@ -3,7 +3,7 @@
import sys
from pathlib import Path

from bioc import biocxml, biocjson
from bioc import biocjson, biocxml
from bs4 import BeautifulSoup

from src.abbreviation import abbreviations
@@ -18,7 +18,7 @@ def handle_path(func):
def inner_function(*args, **kwargs):
try:
return func(*args, **kwargs)
except IOError as io:
except OSError as io:
print(io)
sys.exit()
except OSError as exc:
@@ -62,7 +62,7 @@ def __validate_infile(self):
def __soupify_infile(self, fpath):
fpath = Path(fpath)
try:
with open(fpath, "r", encoding="utf-8") as fp:
with open(fpath, encoding="utf-8") as fp:
soup = BeautifulSoup(fp.read(), "html.parser")
for e in soup.find_all(
attrs={"style": ["display:none", "visibility:hidden"]}
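
Two small modernisations in autoCORPus.py: IOError has been an alias of OSError since Python 3.3, so pyupgrade rewrites the except clause (UP024), and the "r" mode argument is dropped because it is already open()'s default (UP015). A standalone sketch, assumed to run from the repository root:

assert IOError is OSError   # the alias makes `except IOError` and `except OSError` equivalent

with open("pyproject.toml", encoding="utf-8") as fp:   # explicit "r" would be redundant
    first_line = fp.readline()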
2 changes: 1 addition & 1 deletion src/references.py
@@ -28,7 +28,7 @@ class references:

def __create_reference_block(self, reference):
text = reference["node"].get_text().replace("Go to:", "").replace("\n", "")
text = re.sub("\s{2,}", " ", text)
text = re.sub(r"\s{2,}", " ", text)
refSection = {
"section_heading": self.section_heading,
"subsection_heading": "",
8 changes: 4 additions & 4 deletions src/section.py
@@ -77,7 +77,7 @@ def __get_abbreviations(self, soup_section):
abbreviations_tables = abbreviations_tables[0]["node"]
abbreviations = {}
for tr in abbreviations_tables.find_all("tr"):
short_form, long_form = [td.get_text() for td in tr.find_all("td")]
short_form, long_form = (td.get_text() for td in tr.find_all("td"))
abbreviations[short_form] = long_form
except Exception:
abbreviations = {}
@@ -96,9 +96,9 @@ def __set_IAO(self):
if h2_tmp != "":
if any(x in h2_tmp for x in [" and ", "&", "/"]):
mapping_result = []
h2_parts = re.split(" and |\s?/\s?|\s?&\s?", h2_tmp)
h2_parts = re.split(r" and |\s?/\s?|\s?&\s?", h2_tmp)
for h2_part in h2_parts:
h2_part = re.sub("^\d*\s?[\(\.]]?\s?", "", h2_part)
h2_part = re.sub(r"^\d*\s?[\(\.]]?\s?", "", h2_part)
pass
for IAO_term, heading_list in mapping_dict.items():
if any(
@@ -112,7 +112,7 @@ def __set_IAO(self):

else:
for IAO_term, heading_list in mapping_dict.items():
h2_tmp = re.sub("^\d*\s?[\(\.]]?\s?", "", h2_tmp)
h2_tmp = re.sub(r"^\d*\s?[\(\.]]?\s?", "", h2_tmp)
if any(
[fuzz.ratio(h2_tmp, heading) > 80 for heading in heading_list]
):
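
In section.py the tuple unpacking now consumes a generator expression rather than building a throwaway list, alongside the same raw-string regex fixes as elsewhere. A standalone sketch with invented cell values:

cells = ["BMI", "body mass index"]
short_form, long_form = (c.strip() for c in cells)   # unpacking works on any iterable; no intermediate list
assert (short_form, long_form) == ("BMI", "body mass index")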
33 changes: 16 additions & 17 deletions src/table.py
@@ -1,6 +1,6 @@
import re
from datetime import datetime
from itertools import product
from itertools import pairwise, product
from pathlib import Path

from src.utils import get_data_element_node, handle_tables, navigate_contents
@@ -69,8 +69,8 @@ def __table_to_2d(self, t, config):
# value += item.get_text()
# clean the cell
value = value.strip().replace("\u2009", " ").replace("&#x000a0;", " ")
value = re.sub("\s", " ", value)
value = re.sub("<\/?span[^>\n]*>?|<hr\/>?", "", value)
value = re.sub(r"\s", " ", value)
value = re.sub("<\\/?span[^>\n]*>?|<hr\\/>?", "", value)
value = re.sub("\\n", "", value)
if value.startswith("(") and value.endswith(")"):
value = value[1:-1]
@@ -104,12 +104,11 @@ def __check_superrow(self, row):

"""
cleaned_row = set(
[i for i in row if (str(i) != "") & (str(i) != "\n") & (str(i) != "None")]
i for i in row if (str(i) != "") & (str(i) != "\n") & (str(i) != "None")
)
return len(cleaned_row) == 1 and bool(
re.match("[a-zA-Z]", next(iter(cleaned_row)))
)
if len(cleaned_row) == 1 and bool(re.match("[a-zA-Z]", list(cleaned_row)[0])):
return True
else:
return False

def __find_format(self, header):
"""
@@ -138,7 +137,7 @@ def __find_format(self, header):
# identify special character
special_char_idx = []
for idx, part in enumerate(parts):
if part in ":|\/,;":
if part in r":|\/,;":
special_char_idx.append(idx)

# generate regex pattern
@@ -147,9 +146,9 @@ def __find_format(self, header):
for idx in range(len(parts)):
if idx in special_char_idx:
char = parts[idx]
pattern += "({})".format(char)
pattern += f"({char})"
else:
pattern += "(\w+)"
pattern += r"(\w+)"
pattern = re.compile(pattern)
return pattern
else:
@@ -188,7 +187,7 @@ def __split_format(self, pattern, s):
Raises:
KeyError: Raises an exception.
"""
return [i for i in re.split(r"[:|/,;]", s) if i not in ":|\/,;"]
return [i for i in re.split(r"[:|/,;]", s) if i not in r":|\/,;"]

def __get_headers(self, t, config):
"""
@@ -324,10 +323,10 @@ def __table2json(
continue
if row_idx in header_idx:
cur_header = [
table_2d[i] for i in [i for i in subheader_idx if row_idx in i][0]
table_2d[i] for i in next(i for i in subheader_idx if row_idx in i)
]
elif row_idx in superrow_idx:
cur_superrow = [i for i in row if i not in ["", "None"]][0]
cur_superrow = next(i for i in row if i not in ("", "None"))
else:
if cur_header != pre_header:
sections = []
@@ -354,7 +353,7 @@

if len(tables) > 1:
for table_idx, table in enumerate(tables):
table["identifier"] += ".{}".format(table_idx + 1)
table["identifier"] += f".{table_idx + 1}"
return tables

def __reformat_table_json(self, table_json):
@@ -591,7 +590,7 @@ def __main(self, soup, config):

subheader_idx = []
tmp = [header_idx[0]]
for i, j in zip(header_idx, header_idx[1:]):
for i, j in pairwise(header_idx):
if j == i + 1:
tmp.append(j)
else:
@@ -648,7 +647,7 @@ def __init__(self, soup, config, file_name, base_dir):
file_name = Path(file_name).name
self.tableIdentifier = None
self.base_dir = base_dir
if re.search("_table_\d+\.html", file_name):
if re.search(r"_table_\d+\.html", file_name):
self.tableIdentifier = file_name.split("/")[-1].split("_")[-1].split(".")[0]
self.pval_regex = (
r"((\d+\.\d+)|(\d+))(\s?)[*××xX](\s{0,1})10[_]{0,1}([–−-])(\d+)"
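
table.py adopts two idioms that fit the py310 target declared above: itertools.pairwise (new in Python 3.10) replaces the zip(seq, seq[1:]) pattern, and next(...) over a generator replaces indexing [0] into a filtered list. A standalone sketch with invented data:

from itertools import pairwise

header_idx = [0, 1, 4, 5, 6]
assert list(pairwise(header_idx)) == list(zip(header_idx, header_idx[1:]))

row = ["", "None", "Cohort A"]
cur_superrow = next(i for i in row if i not in ("", "None"))   # first meaningful cell, no list built
assert cur_superrow == "Cohort A"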
1 change: 0 additions & 1 deletion src/table_image.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from datetime import datetime
from operator import itemgetter