Skip to content

doctype: parse all child nodes #83

@milahu

Description

@milahu

input

<!doctype html><hr>

result: compound nodes are prefixed with #

# node 25 = fragment: '<!doctype html><hr>'
# node 26 = doctype: '<!doctype html>'
node 1 = <!: '<!' -> '<!'
node 4 = doctype: 'doctype' -> 'doctype'
node 3 = >: '>' -> ' html>'
# node 28 = element: '<hr>'
# node 31 = start_tag: '<hr>'
node 5 = <: '<' -> '<'
node 17 = tag_name: 'hr' -> 'hr'
node 3 = >: '>' -> '>'

problem: the 'html' in '<!doctype html>' has no parse node of its own,
and the closing '>' of '<!doctype html>'
has the same node type (kind_id 3) as the closing '>' of '<hr>'

note how ' html' spills into the source text of '>'
when each leaf's source is computed as
node_source = input_html[last_node_to:node.range.end_byte]

node 1 = <!: '<!' -> '<!'
node 4 = doctype: 'doctype' -> 'doctype'
node 3 = >: '>' -> ' html>'

this causes problems in a semantic stage built on top of this parser,
where I want to ...

either ignore the compound node '<!doctype html>'
and process its child nodes '<!' and 'doctype' and 'html' and '>'

or process the compound node and ignore its child nodes

the cheap solution would be
to use a different node type for '>' of '<!doctype html>'

# https://github.com/tree-sitter/py-tree-sitter/issues/33
#def traverse_tree(tree: Tree):
def walk_html_tree(tree, func, compound_kind_ids=None):
    """Visit every node reachable from ``tree`` in pre-order.

    The traversal uses a cursor instead of recursion, following the approach
    from https://github.com/tree-sitter/py-tree-sitter/issues/33

    Args:
        tree: Object with a ``walk()`` method that returns a cursor exposing
            ``node``, ``goto_first_child()``, ``goto_next_sibling()`` and
            ``goto_parent()``.
        func: Callback invoked as ``func(node, is_compound)`` for each node,
            parents before their children.
        compound_kind_ids: Optional iterable of ``kind_id`` values that should
            be reported as compound. When omitted, the HTML table below is
            used.

    Returns:
        None. All results are delivered through ``func``.
    """
    if compound_kind_ids is None:
        # compound tags
        # these are ignored when serializing the tree
        # Deliberately NOT listed (treated as leaf tokens):
        #   1 '<!', 3 '>', 14 double quote, 12 single quote, 10 attribute_value
        compound_kind_ids = (
            25,  # fragment
            26,  # doctype
            28,  # element
            29,  # script_element
            30,  # style_element
            31,  # start_tag
            34,  # self_closing_tag
            35,  # end_tag
            37,  # attribute
            38,  # quoted_attribute_value
        )
    # A set gives constant-time membership checks for every visited node.
    compound = frozenset(compound_kind_ids)

    cursor = tree.walk()
    while True:
        node = cursor.node
        func(node, node.kind_id in compound)

        # Descend first, then move sideways.
        if cursor.goto_first_child() or cursor.goto_next_sibling():
            continue

        # Dead end: climb until an ancestor has an unvisited sibling.
        # Failing to climb means we are back at the starting node and done.
        while True:
            if not cursor.goto_parent():
                return
            if cursor.goto_next_sibling():
                break

# Byte offset just past the most recently printed leaf node; walk_callback
# advances it so each leaf's slice starts where the previous one ended.
last_node_to = 0

# Kept as bytes: the parser consumes a bytes buffer, node offsets are byte
# offsets, and walk_callback decodes the slices it takes from this value.
input_html = b"""<!doctype html><hr>"""

def walk_callback(node, is_compound):
    """Print one line describing ``node``.

    Compound nodes are printed with a leading ``#`` and only their own text.
    Leaf nodes are also printed with the slice of ``input_html`` that runs
    from the end of the previous leaf to the end of this leaf, so any source
    text not covered by a leaf becomes visible in the following leaf's slice.

    Args:
        node: Node exposing ``text`` (bytes), ``kind_id``, ``type`` and
            ``range.end_byte``.
        is_compound: True when the node should be reported as compound.

    Side effects:
        Writes to stdout and advances the module-level ``last_node_to``.
    """
    # This function is defined at module level, so the running offset is a
    # global; `nonlocal` is a SyntaxError outside of a nested function.
    global last_node_to

    s = repr(node.text.decode("utf8"))
    if len(s) > 50:
        s = s[0:50] + "..."

    if is_compound:
        print(f"# node {node.kind_id} = {node.type}: {s}")
        return

    # end_byte is a byte offset, so slice bytes (not str) and decode afterwards.
    if isinstance(input_html, str):
        source_bytes = input_html.encode("utf8")
    else:
        source_bytes = input_html
    end_byte = node.range.end_byte
    node_source = source_bytes[last_node_to:end_byte].decode("utf8")
    last_node_to = end_byte
    if len(node_source) > 50:
        node_source = node_source[0:50] + "..."
    print(f"node {node.kind_id} = {node.type}: {s} -> {repr(node_source)}")

import tree_sitter
import tree_sitter_languages

# Obtain the HTML parser from tree_sitter_languages; html_parser is an alias
# for the same object.
tree_sitter_html = tree_sitter_languages.get_parser("html")
html_parser = tree_sitter_html

# NOTE(review): py-tree-sitter's Parser.parse expects a bytes buffer, not a
# str -- confirm that input_html is a bytes object before this call.
html_tree = html_parser.parse(input_html)
top_node = html_tree.root_node

# The root Node is passed where walk_html_tree names its parameter `tree`;
# walk_html_tree only calls .walk() on it. walk_callback prints one line per
# visited node.
walk_html_tree(top_node, walk_callback)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions