-
-
Notifications
You must be signed in to change notification settings - Fork 91
Open
Description
input
<!doctype html><hr>
result: compound nodes are prefixed with #
# node 25 = fragment: '<!doctype html><hr>'
# node 26 = doctype: '<!doctype html>'
node 1 = <!: '<!' -> '<!'
node 4 = doctype: 'doctype' -> 'doctype'
node 3 = >: '>' -> ' html>'
# node 28 = element: '<hr>'
# node 31 = start_tag: '<hr>'
node 5 = <: '<' -> '<'
node 17 = tag_name: 'hr' -> 'hr'
node 3 = >: '>' -> '>'
problem: the 'html' in '<!doctype html>' has no parse node of its own,
and the closing '>' of '<!doctype html>' has the same node type (3)
as the closing '>' of '<hr>'.
note how ' html' spills into the '>' node when the source of each leaf node is sliced as
node_source = input_html[last_node_to:node.range.end_byte]:
node 1 = <!: '<!' -> '<!'
node 4 = doctype: 'doctype' -> 'doctype'
node 3 = >: '>' -> ' html>'
This causes problems in a semantic stage that uses this parser, where I want to either
(a) ignore the compound node '<!doctype html>' and process its child nodes '<!', 'doctype', 'html' and '>', or
(b) process the compound node and ignore its child nodes.
The cheap solution would be to use a different node type for the '>' of '<!doctype html>'.
# https://github.com/tree-sitter/py-tree-sitter/issues/33
#def traverse_tree(tree: Tree):
# Grammar symbol ids of "compound" nodes, i.e. nodes that only group other
# nodes and are ignored when serializing the tree.
# NOTE(review): these numbers come from the HTML grammar build used when this
# was written; presumably they can shift between grammar versions -- confirm.
# Leaf tokens are deliberately NOT listed, because they carry source text:
#   1 = '<!', 3 = '>', 10 = attribute_value, 12 = "'", 14 = '"'
_COMPOUND_KIND_IDS = frozenset({
    25,  # fragment
    26,  # doctype
    28,  # element
    29,  # script_element
    30,  # style_element
    31,  # start_tag
    34,  # self_closing_tag
    35,  # end_tag
    37,  # attribute
    38,  # quoted_attribute_value
})


def walk_html_tree(tree, func, compound_kind_ids=_COMPOUND_KIND_IDS):
    """Visit every node below *tree* in depth-first pre-order.

    The traversal is iterative (cursor based), following
    https://github.com/tree-sitter/py-tree-sitter/issues/33, so deep trees do
    not hit the recursion limit.

    Args:
        tree: Any object with a ``walk()`` method returning a cursor that
            offers ``node``, ``goto_first_child()``, ``goto_next_sibling()``
            and ``goto_parent()``.
        func: Callback invoked as ``func(node, is_compound)`` for each node.
        compound_kind_ids: Container of ``node.kind_id`` values that count as
            compound nodes. Defaults to the built-in HTML table.
    """
    cursor = tree.walk()
    while True:
        node = cursor.node
        func(node, node.kind_id in compound_kind_ids)

        # Pre-order: try to descend first, then move sideways.
        if cursor.goto_first_child() or cursor.goto_next_sibling():
            continue

        # Dead end: climb until an ancestor has a sibling not yet visited.
        while True:
            if not cursor.goto_parent():
                # Cannot climb any further: we are back at the start node.
                return
            if cursor.goto_next_sibling():
                break
# Byte offset at which the most recently printed leaf node ended.
last_node_to = 0

# Kept as bytes: the text is sliced with byte offsets and decoded afterwards.
input_html = b"""<!doctype html><hr>"""


def walk_callback(node, is_compound):
    """Print one line describing *node*.

    Compound nodes are printed with a leading ``#``. For every other node the
    line also shows the slice of ``input_html`` from the end of the previously
    printed leaf up to the end of this node, so any source text that has no
    node of its own becomes visible as a prefix of the following leaf.

    Args:
        node: Object exposing ``kind_id``, ``type``, ``text`` (bytes) and
            ``range.end_byte``.
        is_compound: True if the node only groups other nodes.
    """
    # The running offset lives at module level, so rebind it with ``global``.
    global last_node_to

    # Note: truncation is applied to the repr, i.e. quotes count towards 50.
    text = repr(node.text.decode("utf8"))
    if len(text) > 50:
        text = text[0:50] + "..."

    if is_compound:
        print(f"# node {node.kind_id} = {node.type}: {text}")
        return

    end_byte = node.range.end_byte
    node_source = input_html[last_node_to:end_byte].decode("utf8")
    last_node_to = end_byte
    if len(node_source) > 50:
        node_source = node_source[0:50] + "..."
    print(f"node {node.kind_id} = {node.type}: {text} -> {repr(node_source)}")
import tree_sitter
import tree_sitter_languages
# Driver: parse the sample document and print every node of the parse tree.
tree_sitter_html = tree_sitter_languages.get_parser("html")
html_parser = tree_sitter_html
# The parser expects bytes; encode first if the sample was written as text.
html_source = input_html.encode("utf8") if isinstance(input_html, str) else input_html
html_tree = html_parser.parse(html_source)
top_node = html_tree.root_node
# walk_html_tree only needs an object with walk(); the root node is passed in.
walk_html_tree(top_node, walk_callback)
Metadata
Metadata
Assignees
Labels
No labels