'''
Visualization module based on vector clustering
'''
# cluster_insight.py


import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import matplotlib.cm as cm
import matplotlib.colors as mcolors
# from wordcloud import WordCloud
import matplotlib.pyplot as plt
import base64
from io import BytesIO
from sentence_transformers import SentenceTransformer, util
import os
import pickle
# cluster_insight.py → newly added function
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba

# ========== Core configuration (same as before) ==========
import plotly.io as pio
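# Note: Kaleido renders text with fonts installed on the operating system, so the CJK font
# named below is assumed to be available on the host; otherwise Chinese labels may not
# render correctly in exported PNGs.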
pio.kaleido.scope.default_font = "Noto Sans CJK SC"

FONT_FILE_PATH = "./SourceHanSansCN-Light.otf"  # font file lives in the project root
# CHINESE_FONT = "Source Han Sans CN Light"
CHINESE_FONT = "Noto Sans CJK SC"

# CHINESE_FONT = "Noto Sans SC"  # Source Han Sans (cross-platform compatible, ships with Plotly)

def extract_cluster_keywords_auto(sentences, labels, cluster_id, top_n=3):
    """
    自动提取聚类关键词
    :param sentences: 所有句子
    :param labels: 聚类标签
    :param cluster_id: 当前聚类
    :param top_n: 提取前 n 个词
    :return: 关键词字符串
    """
    # 1. 提取该聚类所有句子
    cluster_texts = [sentences[i] for i in range(len(sentences)) if labels[i] == cluster_id]
    if not cluster_texts:
        return "无数据"

    # 2. Tokenize (register domain terms so jieba keeps them as single tokens)
    DOMAIN_SET = {
        # Chinese domain terms (deduplicated and merged)
        "空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
        "数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
        "地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
        "核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
        "地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
        "参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",

        # English domain terms (deduplicated)
        "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
        "Con", "Getis - Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
        "data manager", "POI",
    }
    for word in DOMAIN_SET:
        jieba.add_word(word, freq=10000)

    # Basic stopwords (generic function words)
    STOPWORDS = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么",
                 "怎么", "哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点",
                 "一点", "一些", "进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "对于", "进行", "实际", "情况", "结合",
                 "学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
                 "建议", "希望", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
                 }
    def tokenize(text):
        words = jieba.lcut(text)
        return [
            w for w in words
            if len(w) > 1 and w not in STOPWORDS and not w.isdigit()
        ]

    tokenized = [" ".join(tokenize(text)) for text in cluster_texts]

    # 3. Extract keywords with TF-IDF
    vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
    try:
        tfidf_matrix = vectorizer.fit_transform(tokenized)
        feature_names = vectorizer.get_feature_names_out()

        # Take the words with the highest mean TF-IDF
        mean_tfidf = tfidf_matrix.mean(axis=0).A1
        top_indices = mean_tfidf.argsort()[-top_n:][::-1]
        keywords = [feature_names[i] for i in top_indices]
        return " | ".join(keywords)
    except Exception:
        return "关键词提取失败"


# model_path = r'.\sbert\models--shibing624--text2vec-base-chinese\snapshots\183bb99aa7af74355fb58d16edf8c13ae7c5433e'
# Load the SBERT model from the local snapshot directory
model_dir = os.path.join("sbert", "models--shibing624--text2vec-base-chinese", "snapshots", "183bb99aa7af74355fb58d16edf8c13ae7c5433e")
# model = SentenceTransformer(model_dir)
MODEL = SentenceTransformer(model_dir)

def encode_sentences_with_cache(sentences, model): #, cache_path='sentence_vectors.pkl'
    """
    Encode sentences with the SBERT model (the local pickle cache below is currently disabled).
    """
    # if os.path.exists(cache_path):
    #     print(f"Found cache file {cache_path}, loading vectors...")
    #     with open(cache_path, 'rb') as f:
    #         sentence_vectors = pickle.load(f)
    # else:
    #     print("No cache found, encoding from scratch...")
    sentence_vectors = model.encode(
        sentences,
        batch_size=32,
        show_progress_bar=False,
        convert_to_tensor=False,
        normalize_embeddings=True,       # normalize the embeddings directly
        device="cpu"                     # run explicitly on CPU
    )
    # print(f"Encoding finished, saving to {cache_path}")
    # with open(cache_path, 'wb') as f:
    #     pickle.dump(sentence_vectors, f)

    return sentence_vectors


def auto_select_k(embeddings, max_k=10):
    """自动选择最佳聚类数(轮廓系数最高)"""
    sil_scores = []
    k_range = range(2, min(max_k + 1, len(embeddings) // 2))
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)
        sil_scores.append(silhouette_score(embeddings, labels))

    if not sil_scores:  # too few samples to evaluate any k >= 2; fall back to the minimum
        return 2
    best_k = k_range[np.argmax(sil_scores)]
    print(f"Automatically selected number of clusters: k = {best_k} (silhouette score: {max(sil_scores):.3f})")
    return best_k
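
# Illustration (hypothetical scores): if the silhouette scores came out as
#   k=2 -> 0.21, k=3 -> 0.34, k=4 -> 0.29
# the loop above would pick k=3, i.e. the k with the highest silhouette score.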


def fig_to_base64(fig):
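    """Export a Plotly figure as a base64-encoded PNG, forcing CHINESE_FONT onto every text element first."""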
    # Key fix: before exporting, force the registered font onto every text element of the figure (covers all cases)
    fig.update_layout(
        # Global font (title, axes, legend, ...)
        font=dict(family=CHINESE_FONT, size=18),
        # Title font (set separately so it is not overridden)
        titlefont=dict(family=CHINESE_FONT, size=22),
        # Axis title and tick fonts (must be set explicitly when axes are present)
        xaxis=dict(titlefont=dict(family=CHINESE_FONT), tickfont=dict(family=CHINESE_FONT)),
        yaxis=dict(titlefont=dict(family=CHINESE_FONT), tickfont=dict(family=CHINESE_FONT)),
        # Legend font (in case a legend is present)
        legend=dict(font=dict(family=CHINESE_FONT))
    )
    
    # Apply the font to every trace (e.g. Sankey node labels, scatter text, ...)
    for trace in fig.data:
        # Sankey node text
        if hasattr(trace, "node") and hasattr(trace.node, "font"):
            trace.node.font.family = CHINESE_FONT
        # Text labels on scatter/line traces (e.g. the cluster-centre keywords)
        if hasattr(trace, "textfont"):
            trace.textfont.family = CHINESE_FONT
        # Hover label text (set as well so nothing is missed)
        if hasattr(trace, "hoverlabel") and hasattr(trace.hoverlabel, "font"):
            trace.hoverlabel.font.family = CHINESE_FONT
    
    # Original PNG-to-Base64 logic, unchanged
    buffer = BytesIO()
    fig.write_image(
        buffer,
        format="png",
        engine="kaleido",
        width=900,
        height=600,
        scale=2,  # higher resolution, and more stable font rendering
        validate=False
    )
    buffer.seek(0)
    return base64.b64encode(buffer.read()).decode("ascii")

def cluster_and_visualize(
        excel_path: str,
        questions=['s1', 's2', 's3', 's4'],
        max_k=15
    ):
    """
    输入:Excel + 句向量 pkl
    输出:(聚类图 base64, 统计信息 dict)
    """
    # 1. 加载数据
    df = pd.read_excel(excel_path)
    sentences = []
    meta = []
    for idx, row in df.iterrows():
        for q in questions:
            text = str(row[q]).strip()
            if text and text.lower() != 'nan':  # skip empty cells (pandas NaN stringifies to 'nan')
                sentences.append(text)
                meta.append((row['no'], q))
    emb = encode_sentences_with_cache(sentences, MODEL)
    # with open(pkl_path, 'rb') as f:
    #     emb = pickle.load(f)

    # 2. Automatically choose k
    n_clusters = auto_select_k(emb, max_k=max_k)
    # n_clusters = 8

    # 3. Cluster
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(emb)
    labels = kmeans.labels_
    closest = np.argmin(np.linalg.norm(emb - kmeans.cluster_centers_[:, np.newaxis], axis=2), axis=1)

    # After clustering, replace hand-written keywords with automatically extracted ones
    cluster_keywords_auto = []
    for i in range(n_clusters):
        kw = extract_cluster_keywords_auto(sentences, labels, i, top_n=5)
        cluster_keywords_auto.append(kw)

    # 4. Statistics
    stats = []
    total = len(sentences)
    for i in range(n_clusters):
        cluster_sents = [s for s, l in zip(sentences, labels) if l == i]
        size = len(cluster_sents)
        rep_sent = sentences[closest[i]]
        stats.append({
            'cluster_id': i,
            'size': size,
            'ratio': size / total,
            'rep_sentence': rep_sent,
            'keyword': cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
        })

    # 5. Visualization
    tsne = TSNE(n_components=2, random_state=42)
    emb_2d = tsne.fit_transform(emb)

    # get_cmap(name, lut) is deprecated in recent Matplotlib releases;
    # matplotlib.colormaps['rainbow'].resampled(n_clusters) is the newer equivalent.
    cmap = cm.get_cmap('rainbow', n_clusters)
    cluster_colors = [mcolors.rgb2hex(cmap(i)[:3]) for i in range(n_clusters)]
    point_colors = [cluster_colors[l] for l in labels]

    fig = go.Figure()
    # 数据点
    fig.add_trace(go.Scatter(
        x=emb_2d[:, 0], y=emb_2d[:, 1],
        mode='markers',
        marker=dict(size=10, color=point_colors, opacity=0.7),
        text=[f"聚类 {l}" for l in labels],
        hoverinfo='text',
        hoverlabel=dict(
            font=dict(family="Noto Sans CJK SC", size=16),  # hover font that covers Chinese glyphs
            bgcolor='white'
        ),
        textfont=dict(family="Noto Sans CJK SC"),
        showlegend=False
    ))
    # 聚类中心
    center_x = emb_2d[closest, 0]
    center_y = emb_2d[closest, 1]
    for i, (x, y) in enumerate(zip(center_x, center_y)):
        keyword = cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
        fig.add_trace(go.Scatter(
            x=[x], y=[y],
            mode='markers+text',
            marker=dict(size=30, color=cluster_colors[i], line=dict(width=2, color='black')),
            text=[keyword],
            textposition="top center",
            textfont=dict(family="Noto Sans CJK SC", size=20, color='black'),
            showlegend=False
        ))

    fig.update_layout(
        title="EGISInsight:学生反馈聚类洞察",
        titlefont=dict(
            family="Noto Sans CJK SC",
            size=22
        ),
        font=dict(family="Noto Sans CJK SC", size=18),
        width=900, height=600,
        plot_bgcolor='#F5F5F5',
        autosize=False,   # disable auto-resizing
        margin=dict(l=50, r=50, t=80, b=50)  # keep the plot centred inside the canvas
    )

    img_bytes = fig.to_image(format="png", width=900, height=600, scale=2,
                             engine='kaleido')  # newly added line
    b64 = base64.b64encode(img_bytes).decode('utf-8')
    # New code:
    # img_bytes = pio.to_image(
    #     fig,
    #     format="png",
    #     width=900,
    #     height=600,
    #     scale=2
    #     )
    # b64 = base64.b64encode(img_bytes).decode('utf-8')
    # b64 = fig_to_base64(fig)
    # print(f"{b64}解析成功!")
    # return b64, stats
    return fig, b64, stats
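

if __name__ == "__main__":
    # Minimal usage sketch. The Excel path below is hypothetical; the file is assumed to
    # contain a 'no' column plus the default answer columns s1-s4.
    fig, img_b64, cluster_stats = cluster_and_visualize("student_feedback.xlsx", max_k=10)
    for item in cluster_stats:
        print(item['cluster_id'], item['size'], round(item['ratio'], 3), item['keyword'])
    # The base64 string can be embedded directly in an HTML report, e.g.
    # <img src="data:image/png;base64,...">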