diff --git a/albero/__init__.py b/albero/__init__.py index 5137909..4fb251d 100644 --- a/albero/__init__.py +++ b/albero/__init__.py @@ -2,7 +2,8 @@ beartype_this_package() +from token_tools import GENERIC_TOKENS, Token # noqa: F401, E402 + from .languages import get_lang, get_mapping # noqa: F401, E402 from .misc import AlberoException, lang_from_so # noqa: F401, E402 -from .tokens import Token, generic_tokens # noqa: F401, E402 from .tree_sitter import TreeSitterHighlighter # noqa: F401, E402 diff --git a/albero/tokens.py b/albero/tokens.py deleted file mode 100644 index 3f2b9bf..0000000 --- a/albero/tokens.py +++ /dev/null @@ -1,71 +0,0 @@ -Token = tuple[tuple[int, int], int, str] - -generic_tokens: list[str] = [ - "Whitespace", - "Text", - "Error", - "Keyword", - "Name", - "String", - "Number", - "Literal", - "Operator", - "Punctuation", - "Comment", - "Generic", -] - - -def only_tokens_in_text_range( - tokens: list[Token], text_range: tuple[int, int] -) -> list[Token]: - # We create a new list because lists are pass by reference - output_tokens: list[Token] = [] - - for token in tokens: - token_lineno: int = token[0][0] - minimum_line: int = text_range[0] - maximum_line: int = text_range[1] - - if token_lineno < minimum_line or token_lineno > maximum_line: - continue - - output_tokens.append(token) - - output_tokens = merge_tokens(output_tokens) - return output_tokens - - -def merge_tokens(tokens: list[Token]) -> list[Token]: - output_tokens: list[Token] = [] - depth: int = 0 - for token in tokens: - # Deal with basic edge case - if depth == 0: - output_tokens.append(token) - depth += 1 - continue - - previous_token = output_tokens[-1] - - # Get our boolean checks - same_token_type: bool = previous_token[2] == token[2] - same_line: bool = previous_token[0][0] == token[0][0] - neighboring_tokens: bool = ( - previous_token[0][1] + previous_token[1] == token[0][1] - ) - - # Determine if tokens should be merged - if not (same_token_type and same_line and 
neighboring_tokens): - output_tokens.append(token) - depth += 1 - continue - - # Replace previous token with new token (we don't increase depth because we are substituting, not adding) - new_token: Token = ( - (token[0][0], previous_token[0][1]), - previous_token[1] + token[1], - token[2], - ) - output_tokens[-1] = new_token - return output_tokens diff --git a/albero/tree_sitter.py b/albero/tree_sitter.py index 9a6b70e..dbdb1d1 100644 --- a/albero/tree_sitter.py +++ b/albero/tree_sitter.py @@ -1,9 +1,9 @@ from logging import Logger, getLogger +from token_tools import Token, only_tokens_in_text_range from tree_sitter import Language, Parser, Tree from .misc import AlberoException, normal_text_range -from .tokens import Token, only_tokens_in_text_range from .tree_sitter_funcs import edit_tree, node_to_tokens TreeAndParser = tuple[Tree, Parser, str] # Tree, Parser, code diff --git a/albero/tree_sitter_funcs.py b/albero/tree_sitter_funcs.py index aef29d5..a693c7a 100644 --- a/albero/tree_sitter_funcs.py +++ b/albero/tree_sitter_funcs.py @@ -1,9 +1,8 @@ from logging import Logger +from token_tools import Token, merge_tokens from tree_sitter import Node, Parser, Tree, TreeCursor -from .tokens import Token, merge_tokens - def node_to_tokens( root_node: Node | Tree, mapping: dict[str, str], logger: Logger diff --git a/docs/source/variables.rst b/docs/source/variables.rst index b22d158..21db362 100644 --- a/docs/source/variables.rst +++ b/docs/source/variables.rst @@ -2,18 +2,9 @@ Variables ========= -.. _Token Overview: +.. _Token and GENERIC_TOKENS Overview: -``Token`` -********* +``Token`` and ``GENERIC_TOKENS`` +******************************** -The ``Token`` type is, in reality, just a type alias of ``tuple[tuple[int, int], int, str]``. Despite simply being a ``tuple``, the ``Token`` is the most frequently returned data type. That being said, what data does it signify? - -The ``Token`` type contains three parts: the start index, its length, and its type. 
The start index is that ``tuple`` at the beginning of the main ``tuple`` and the first index of that is the line the ``Token`` takes place on and the second is the column. ``Token``'s start at index ``1, 0`` so you may need to do a -1 or a +1 depending on how you need to use this data. The second ``int`` is the length of the ``Token` and the ``str`` is the type. You will use these very often so its never a bad idea to get familiar with them. - -.. _Generic Tokens Overview: - -``generic_tokens`` -****************** - -The ``GENERIC_TOKENS`` ``list`` provides all the generic ``Token`` types you will encounter when using ``Albero``. Simply print this out and you will know all possible ``Token`` types and will never be surprised by a random ``Token`` type again. +See `Token Tools Documentation `_ for details. Note that ``Token`` is a very common return type used by ``Albero``. diff --git a/examples/example_mapping_creation.py b/examples/example_mapping_creation.py index 311e1dc..e044049 100644 --- a/examples/example_mapping_creation.py +++ b/examples/example_mapping_creation.py @@ -2,7 +2,7 @@ from tree_sitter import Language -from albero import Token, TreeSitterHighlighter, generic_tokens, get_lang +from albero import GENERIC_TOKENS, Token, TreeSitterHighlighter, get_lang # Logging basicConfig( @@ -12,7 +12,7 @@ # Useful stuff for highlighting archaic_lang: Language = get_lang("archaic_language") -print(generic_tokens) +print(GENERIC_TOKENS) custom_mapping: dict[str, str] = { "xyz_tree_sitter_stuff": "token_in_generic_tokens" # See python mapping in albero source } diff --git a/requirements-dev.txt b/requirements-dev.txt index 561fa19..5c5703c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,7 @@ # Normal requirements beartype tree-sitter +token_tools # Languages tree-sitter-css diff --git a/requirements.txt b/requirements.txt index 708fffc..dec83f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ beartype tree-sitter
+token_tools # Languages tree-sitter-css