wxy01giser committed on
Commit
3ede521
·
verified ·
1 Parent(s): 87a107e

Update docSim.py

Browse files
Files changed (1) hide show
  1. docSim.py +5 -19
docSim.py CHANGED
@@ -20,25 +20,11 @@ def semantic_similarity(text1, text2):
20
  return float(util.cos_sim(emb1, emb2))
21
 
22
  def calcDocSims(file):
23
- # 替换 JS 风格的 true/false/null 为 Python 可识别格式
24
- file = file.replace("true", "true".lower()).replace("false", "false".lower()).replace("null", "null".lower())
25
- # 1. 去除控制字符(0x00 - 0x1F)
26
- cleaned = re.sub(r'[\x00-\x1F]+', '', file)
27
-
28
- # 2. 替换 JS 风格 true/false/null 为 Python 能识别的形式
29
- cleaned = cleaned.replace("true", "true").replace("false", "false").replace("null", "null")
30
- data = json.loads(cleaned)
31
-
32
- pattern = r"核心痛点[::\s]*([\s\S]*?)优化措施[::\s]*"
33
- res1 = re.search(pattern, data['file'][0][0]['text'], flags=re.S)
34
- res1 = res1.group(1).strip()
35
- res1 = re.sub(r"-?\s*核心教学痛点\d*[::]\s*", "", res1)
36
-
37
- res2 = re.search(pattern, data['file'][1][0]['text'], flags=re.S)
38
- res2 = res2.group(1).strip()
39
- res2 = re.sub(r"-?\s*核心教学痛点\d*[::]\s*", "", res2)
40
- sim = semantic_similarity(res1, res2)
41
- return 1-sim, res1, res2
42
 
43
  if __name__ == '__main__':
44
  s = """
 
20
  return float(util.cos_sim(emb1, emb2))
21
 
22
  def calcDocSims(file):
23
+ file = re.findall(r'text=(.*?),\s*error=', s, flags=re.DOTALL)
24
+ res_list = [extract_core_painpoints(t) for t in file]
25
+ # 假设只比较前两个
26
+ sim = semantic_similarity(res_list[0], res_list[1])
27
+ return 1-sim, res_list[0], res_list[1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  if __name__ == '__main__':
30
  s = """