Enable more ruff checks and fix #67

Merged 5 commits on Oct 31, 2024
31 changes: 29 additions & 2 deletions poetry.lock

Some generated files are not rendered by default.

19 changes: 19 additions & 0 deletions pyproject.toml
@@ -26,6 +26,7 @@ python-levenshtein = "^0.26.1"
pytest = "^8.3.2"
pytest-cov = "^5.0.0"
pytest-mock = "^3.14.0"
ruff = "^0.7.1"

[tool.poetry.group.docs.dependencies]
mkdocs = "^1.6.0"
@@ -42,3 +43,21 @@ build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
addopts = "-v -p no:warnings --cov=src --cov-report=html --doctest-modules --ignore=run_app.py"

[tool.ruff]
target-version = "py310"

[tool.ruff.lint]
select = [
"E", # pycodestyle
"F", # Pyflakes
"I", # isort
"UP", # pyupgrade
"RUF", # ruff
"W", # pylint
]
ignore = [
"E501", # line too long
"RUF001", # ambiguous multiplication sign
"RUF003", # ambiguous fullwidth colon
]
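
As a rough illustration (not part of the diff), these are the kinds of issues the selected rule families report, assuming the [tool.ruff.lint] configuration above; with it in place, running ruff check . from the project root lists them and ruff check --fix . applies the auto-fixable ones:

import re, sys                    # E401: multiple imports on one line; F401 (Pyflakes): `sys` imported but unused
msg = "found %s rows" % 3         # UP031 (pyupgrade): printf-style formatting, prefer an f-string
m = re.search("\d+", msg)         # W605 (pycodestyle warning): invalid escape sequence in a non-raw string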
14 changes: 7 additions & 7 deletions run_app.py
@@ -58,7 +58,7 @@ def get_file_type(file_path):
if file_path.is_dir():
return "directory"
elif file_path.suffix == ".html":
if re.search("table_\d+.html", file_path.name):
if re.search(r"table_\d+.html", file_path.name):
return "linked_tables"
else:
return "main_text"
@@ -118,17 +118,17 @@ def read_file_structure(file_path, target_dir):
if ftype == "directory":
continue
elif ftype == "main_text":
base_file = re.sub("\.html", "", fpath)
base_file = re.sub(r"\.html", "", fpath)
structure = fill_structure(structure, base_file, "main_text", fpath)
structure = fill_structure(structure, base_file, "out_dir", out_dir)
elif ftype == "linked_tables":
base_file = re.sub("_table_\d+\.html", "", fpath)
base_file = re.sub(r"_table_\d+\.html", "", fpath)
structure = fill_structure(
structure, base_file, "linked_tables", fpath
)
structure = fill_structure(structure, base_file, "out_dir", out_dir)
elif ftype == "table_images":
base_file = re.sub("_table_\d+\..*", "", fpath)
base_file = re.sub(r"_table_\d+\..*", "", fpath)
structure = fill_structure(
structure, base_file, "table_images", fpath
)
@@ -143,11 +143,11 @@ def read_file_structure(file_path, target_dir):
else:
ftype = get_file_type(file_path)
if ftype == "main_text":
base_file = re.sub("\.html", "", file_path).split("/")[-1]
base_file = re.sub(r"\.html", "", file_path).split("/")[-1]
if ftype == "linked_tables":
base_file = re.sub("_table_\d+\.html", "", file_path).split("/")[-1]
base_file = re.sub(r"_table_\d+\.html", "", file_path).split("/")[-1]
if ftype == "table_images":
base_file = re.sub("_table_\d+\..*", "", file_path).split("/")[-1]
base_file = re.sub(r"_table_\d+\..*", "", file_path).split("/")[-1]
template = {
base_file: {
"main_text": "",
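
For context (not part of the diff): the run_app.py changes only add the r prefix to the regex literals, so backslash escapes such as \d reach the re module unchanged instead of being interpreted by the string literal; ruff flags the un-prefixed form as an invalid escape sequence (W605). A minimal sketch:

import re

re.search("table_\d+.html", "table_12.html")    # non-raw string: "\d" is an invalid escape sequence (W605)
re.search(r"table_\d+.html", "table_12.html")   # raw string: the backslash is handed to the regex engine as-is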
28 changes: 9 additions & 19 deletions src/abbreviation.py
@@ -12,7 +12,7 @@ def __yield_lines_from_doc(self, doc_text):
yield line.strip()

def __conditions(self, candidate):
"""
r"""
Based on Schwartz&Hearst

2 <= len(str) <= 10
@@ -52,10 +52,10 @@ def __best_candidates(self, sentence):
if "(" in sentence:
# Check some things first
if sentence.count("(") != sentence.count(")"):
raise ValueError("Unbalanced parentheses: {}".format(sentence))
raise ValueError(f"Unbalanced parentheses: {sentence}")

if sentence.find("(") > sentence.find(")"):
raise ValueError("First parentheses is right: {}".format(sentence))
raise ValueError(f"First parentheses is right: {sentence}")

close_index = -1
while 1:
@@ -139,7 +139,7 @@ def __get_definition(self, candidate, sentence):
start_index = len(first_chars) - 1
while count < candidate_freq:
if abs(start) > len(first_chars):
raise ValueError("candidate {} not found".format(candidate))
raise ValueError(f"candidate {candidate} not found")
start -= 1
# Look up key in the definition
try:
@@ -214,9 +214,7 @@ def __select_definition(self, definition, abbrev):
l_index -= 1
if l_index == -1 * (len(definition) + 1):
raise ValueError(
"definition {} was not found in {}".format(
abbrev, definition
)
f"definition {abbrev} was not found in {definition}"
)

else:
@@ -271,19 +269,15 @@ def __extract_abbreviation_definition_pairs(
definition = self.__get_definition(candidate, clean_sentence)
except (ValueError, IndexError) as e:
self.log.debug(
"{} Omitting candidate {}. Reason: {}".format(
i, candidate, e.args[0]
)
f"{i} Omitting candidate {candidate}. Reason: {e.args[0]}"
)
omit += 1
else:
try:
definition = self.__select_definition(definition, candidate)
except (ValueError, IndexError) as e:
self.log.debug(
"{} Omitting definition {} for candidate {}. Reason: {}".format(
i, definition, candidate, e.args[0]
)
f"{i} Omitting definition {definition} for candidate {candidate}. Reason: {e.args[0]}"
)
omit += 1
else:
@@ -295,12 +289,8 @@ def __extract_abbreviation_definition_pairs(
abbrev_map[candidate] = definition
written += 1
except (ValueError, IndexError) as e:
self.log.debug(
"{} Error processing sentence {}: {}".format(i, sentence, e.args[0])
)
self.log.debug(
"{} abbreviations detected and kept ({} omitted)".format(written, omit)
)
self.log.debug(f"{i} Error processing sentence {sentence}: {e.args[0]}")
self.log.debug(f"{written} abbreviations detected and kept ({omit} omitted)")

# Return most common definition for each term
if collect_definitions:
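
The abbreviation.py edits are mechanical .format() to f-string rewrites (pyupgrade's UP032), plus an r prefix on the __conditions docstring, which keeps any backslash sequences in it literal. A standalone sketch of the rewrite, with an invented value:

candidate = "BMI"
old = "candidate {} not found".format(candidate)   # flagged by UP032
new = f"candidate {candidate} not found"           # preferred form
assert old == new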
6 changes: 3 additions & 3 deletions src/autoCORPus.py
@@ -3,7 +3,7 @@
import sys
from pathlib import Path

from bioc import biocxml, biocjson
from bioc import biocjson, biocxml
from bs4 import BeautifulSoup

from src.abbreviation import abbreviations
@@ -18,7 +18,7 @@ def handle_path(func):
def inner_function(*args, **kwargs):
try:
return func(*args, **kwargs)
except IOError as io:
except OSError as io:
print(io)
sys.exit()
except OSError as exc:
@@ -62,7 +62,7 @@ def __validate_infile(self):
def __soupify_infile(self, fpath):
fpath = Path(fpath)
try:
with open(fpath, "r", encoding="utf-8") as fp:
with open(fpath, encoding="utf-8") as fp:
soup = BeautifulSoup(fp.read(), "html.parser")
for e in soup.find_all(
attrs={"style": ["display:none", "visibility:hidden"]}
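
Two small modernisations in autoCORPus.py: IOError has been an alias of OSError since Python 3.3, so pyupgrade rewrites the except clause (UP024), and the "r" mode argument is dropped because it is already open()'s default (UP015). A standalone sketch, assumed to run from the repository root:

assert IOError is OSError   # the alias makes `except IOError` and `except OSError` equivalent

with open("pyproject.toml", encoding="utf-8") as fp:   # explicit "r" would be redundant
    first_line = fp.readline()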
2 changes: 1 addition & 1 deletion src/references.py
@@ -28,7 +28,7 @@ class references:

def __create_reference_block(self, reference):
text = reference["node"].get_text().replace("Go to:", "").replace("\n", "")
text = re.sub("\s{2,}", " ", text)
text = re.sub(r"\s{2,}", " ", text)
refSection = {
"section_heading": self.section_heading,
"subsection_heading": "",
8 changes: 4 additions & 4 deletions src/section.py
@@ -77,7 +77,7 @@ def __get_abbreviations(self, soup_section):
abbreviations_tables = abbreviations_tables[0]["node"]
abbreviations = {}
for tr in abbreviations_tables.find_all("tr"):
short_form, long_form = [td.get_text() for td in tr.find_all("td")]
short_form, long_form = (td.get_text() for td in tr.find_all("td"))
abbreviations[short_form] = long_form
except Exception:
abbreviations = {}
@@ -96,9 +96,9 @@ def __set_IAO(self):
if h2_tmp != "":
if any(x in h2_tmp for x in [" and ", "&", "/"]):
mapping_result = []
h2_parts = re.split(" and |\s?/\s?|\s?&\s?", h2_tmp)
h2_parts = re.split(r" and |\s?/\s?|\s?&\s?", h2_tmp)
for h2_part in h2_parts:
h2_part = re.sub("^\d*\s?[\(\.]]?\s?", "", h2_part)
h2_part = re.sub(r"^\d*\s?[\(\.]]?\s?", "", h2_part)
pass
for IAO_term, heading_list in mapping_dict.items():
if any(
@@ -112,7 +112,7 @@ def __set_IAO(self):

else:
for IAO_term, heading_list in mapping_dict.items():
h2_tmp = re.sub("^\d*\s?[\(\.]]?\s?", "", h2_tmp)
h2_tmp = re.sub(r"^\d*\s?[\(\.]]?\s?", "", h2_tmp)
if any(
[fuzz.ratio(h2_tmp, heading) > 80 for heading in heading_list]
):
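
In section.py the tuple unpacking now consumes a generator expression rather than building a throwaway list, alongside the same raw-string regex fixes as elsewhere. A standalone sketch with invented cell values:

cells = ["BMI", "body mass index"]
short_form, long_form = (c.strip() for c in cells)   # unpacking works on any iterable; no intermediate list
assert (short_form, long_form) == ("BMI", "body mass index")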
33 changes: 16 additions & 17 deletions src/table.py
@@ -1,6 +1,6 @@
import re
from datetime import datetime
from itertools import product
from itertools import pairwise, product
from pathlib import Path

from src.utils import get_data_element_node, handle_tables, navigate_contents
@@ -69,8 +69,8 @@ def __table_to_2d(self, t, config):
# value += item.get_text()
# clean the cell
value = value.strip().replace("\u2009", " ").replace("&#x000a0;", " ")
value = re.sub("\s", " ", value)
value = re.sub("<\/?span[^>\n]*>?|<hr\/>?", "", value)
value = re.sub(r"\s", " ", value)
value = re.sub("<\\/?span[^>\n]*>?|<hr\\/>?", "", value)
value = re.sub("\\n", "", value)
if value.startswith("(") and value.endswith(")"):
value = value[1:-1]
@@ -104,12 +104,11 @@ def __check_superrow(self, row):

"""
cleaned_row = set(
[i for i in row if (str(i) != "") & (str(i) != "\n") & (str(i) != "None")]
i for i in row if (str(i) != "") & (str(i) != "\n") & (str(i) != "None")
)
return len(cleaned_row) == 1 and bool(
re.match("[a-zA-Z]", next(iter(cleaned_row)))
)
if len(cleaned_row) == 1 and bool(re.match("[a-zA-Z]", list(cleaned_row)[0])):
return True
else:
return False

def __find_format(self, header):
"""
@@ -138,7 +137,7 @@ def __find_format(self, header):
# identify special character
special_char_idx = []
for idx, part in enumerate(parts):
if part in ":|\/,;":
if part in r":|\/,;":
special_char_idx.append(idx)

# generate regex pattern
@@ -147,9 +146,9 @@ def __find_format(self, header):
for idx in range(len(parts)):
if idx in special_char_idx:
char = parts[idx]
pattern += "({})".format(char)
pattern += f"({char})"
else:
pattern += "(\w+)"
pattern += r"(\w+)"
pattern = re.compile(pattern)
return pattern
else:
@@ -188,7 +187,7 @@ def __split_format(self, pattern, s):
Raises:
KeyError: Raises an exception.
"""
return [i for i in re.split(r"[:|/,;]", s) if i not in ":|\/,;"]
return [i for i in re.split(r"[:|/,;]", s) if i not in r":|\/,;"]

def __get_headers(self, t, config):
"""
@@ -324,10 +323,10 @@ def __table2json(
continue
if row_idx in header_idx:
cur_header = [
table_2d[i] for i in [i for i in subheader_idx if row_idx in i][0]
table_2d[i] for i in next(i for i in subheader_idx if row_idx in i)
]
elif row_idx in superrow_idx:
cur_superrow = [i for i in row if i not in ["", "None"]][0]
cur_superrow = next(i for i in row if i not in ("", "None"))
else:
if cur_header != pre_header:
sections = []
@@ -354,7 +353,7 @@

if len(tables) > 1:
for table_idx, table in enumerate(tables):
table["identifier"] += ".{}".format(table_idx + 1)
table["identifier"] += f".{table_idx + 1}"
return tables

def __reformat_table_json(self, table_json):
@@ -591,7 +590,7 @@ def __main(self, soup, config):

subheader_idx = []
tmp = [header_idx[0]]
for i, j in zip(header_idx, header_idx[1:]):
for i, j in pairwise(header_idx):
if j == i + 1:
tmp.append(j)
else:
@@ -648,7 +647,7 @@ def __init__(self, soup, config, file_name, base_dir):
file_name = Path(file_name).name
self.tableIdentifier = None
self.base_dir = base_dir
if re.search("_table_\d+\.html", file_name):
if re.search(r"_table_\d+\.html", file_name):
self.tableIdentifier = file_name.split("/")[-1].split("_")[-1].split(".")[0]
self.pval_regex = (
r"((\d+\.\d+)|(\d+))(\s?)[*××xX](\s{0,1})10[_]{0,1}([–−-])(\d+)"
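
table.py adopts two idioms that fit the py310 target declared above: itertools.pairwise (new in Python 3.10) replaces the zip(seq, seq[1:]) pattern, and next(...) over a generator replaces indexing [0] into a filtered list. A standalone sketch with invented data:

from itertools import pairwise

header_idx = [0, 1, 4, 5, 6]
assert list(pairwise(header_idx)) == list(zip(header_idx, header_idx[1:]))

row = ["", "None", "Cohort A"]
cur_superrow = next(i for i in row if i not in ("", "None"))   # first meaningful cell, no list built
assert cur_superrow == "Cohort A"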
1 change: 0 additions & 1 deletion src/table_image.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from datetime import datetime
from operator import itemgetter