File size: 3,435 Bytes
8496edd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import json
import re
import pypandoc

# Sample Markdown string (a heading, inline emphasis, and a bullet list).
# NOTE: the __main__ block at the bottom of this file rebinds this name to the
# contents of a file, and both converter functions take a parameter of the
# same name that shadows it.
markdown_text = """
# My Document

Some **bold** text here, and some *italic* text there.

- Bullet point 1
- Bullet point 2
"""


def markdown_to_latex(markdown_text):
    """Return *markdown_text* rendered as LaTeX.

    The conversion is delegated to ``pypandoc.convert_text`` with the input
    format ``'md'`` and the output format ``'latex'``.
    """
    return pypandoc.convert_text(markdown_text, to='latex', format='md')


def markdown_to_json_method(markdown_text):
    """Parse a Markdown outline into a nested list of heading nodes.

    Every heading (a line starting with ``#``) becomes a node of the form
    ``{"method_class": <title>, "children": [...], "description": <text>}``.
    A heading is nested under the closest preceding heading that has fewer
    ``#`` characters; headings with no such ancestor are top-level nodes.

    The non-blank lines that directly follow a heading (up to the first blank
    line, heading, or list item) are joined with single spaces and stored as
    the heading's ``description``. Other free text is ignored.

    Every list item (a line starting with ``-``) becomes
    ``{"method": <name>, "description": <text>}`` and is appended to the
    ``children`` of the most recent heading. The item is split on the first
    ``": "``; without that separator the whole item is the method name and
    the description is empty. Items that appear before any heading are
    appended to the top-level result.

    Args:
        markdown_text: The Markdown source to parse.

    Returns:
        The list of top-level nodes (the children of an implicit root).
    """
    root = {"method_class": "root", "children": []}
    # Stack of currently open headings. The root sentinel sits at level 0,
    # below any real heading (level >= 1), so it is never popped and
    # stack[-1] always exists.
    stack = [{"node": root, "level": 0}]

    lines = [raw.strip() for raw in markdown_text.strip().split('\n')]
    total = len(lines)
    i = 0

    while i < total:
        line = lines[i]
        i += 1

        if not line:
            continue

        if line.startswith('#'):
            match = re.match(r'^(#+)\s*(.*?)$', line)
            if not match:
                continue
            hashes, method_class = match.groups()
            current_level = len(hashes)

            # Close every open heading at the same or a deeper level so that
            # the top of the stack is the parent of the new heading.
            while stack[-1]["level"] >= current_level:
                stack.pop()

            new_node = {"method_class": method_class, "children": [], "description": ""}
            stack[-1]["node"]["children"].append(new_node)
            stack.append({"node": new_node, "level": current_level})

            # Consume the description lines that directly follow the heading.
            # When this loop ends, `i` already points at the first line that
            # has not been handled yet (blank, heading, or list item), so it
            # must NOT be stepped back: stepping back with an empty
            # description would land on the heading itself and re-parse it
            # forever.
            description_lines = []
            while i < total and lines[i] and not lines[i].startswith(('#', '-')):
                description_lines.append(lines[i])
                i += 1
            new_node["description"] = " ".join(description_lines)

        elif line.startswith('-'):
            # "name: text" -> method/description; without ": " the partition
            # leaves the description empty.
            method, _, description = line[1:].strip().partition(': ')
            item = {"method": method.strip(), "description": description.strip()}
            # Attach to the most recent heading, or to root if none was seen.
            stack[-1]["node"]["children"].append(item)

    return root["children"]


if __name__ == "__main__":
    # Read the Markdown document from the hard-coded relative path, parse it
    # into the nested node structure, and pretty-print that structure as JSON
    # (non-ASCII characters are written as-is rather than escaped).
    with open("../data/actor_data/docs/method_en_v1.md", "r", encoding="utf-8") as source_file:
        markdown_text = source_file.read()

    parsed_nodes = markdown_to_json_method(markdown_text)
    rendered = json.dumps(parsed_nodes, indent=2, ensure_ascii=False)
    print(rendered)


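    # SECURITY: a credential-like string (apparently an API key) was committed here in
    # plain text and has been redacted. Revoke/rotate the exposed key and load secrets
    # from the environment or a secrets manager instead of the source file.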