拉取微信聊天记录

这里推荐这个项目:https://github.com/xaoyaoo/PyWxDump/tree/master
最新的release下载可运行文件即可
解密完成后可以选择要导出的聊天记录,导出为.csv格式

下载停用词stopwords

推荐项目:https://github.com/goto456/stopwords
下载cn_stopwords.txt文件,自行删改

生成代码

默认导出csv文件为utf-8编码

import csv
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os

def read_csv_files(file_paths):
    all_texts = []
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as csvfile:
                reader = csv.reader(csvfile)
                for row in reader:
                    try:
                        text = row[6]
                        all_texts.append(text)
                    except IndexError:
                        print(f"文件 {file_path} 中某行数据列数不足,已跳过")
                        continue
        except FileNotFoundError:
            print(f"文件 {file_path} 未找到")
        except Exception as e:
            print(f"读取文件 {file_path} 失败: {e}")
    return all_texts

def segment_text(texts):
    seg_list = []
    for text in texts:
        seg_list.extend(jieba.cut(text, cut_all=False))  # 使用精确模式
    return seg_list

def remove_stopwords(words, stopword_file):
    stopwords = set()
    try:
        with open(stopword_file, 'r', encoding='utf-8') as f:
            for line in f:
                stopwords.add(line.strip())
    except FileNotFoundError:
        print(f"停用词文件 {stopword_file} 未找到,将不移除停用词")
        return words
    except Exception as e:
        print(f"读取停用词文件失败: {e},将不移除停用词")
        return words

    filtered_words = [word for word in words if word not in stopwords and len(word) > 1]  # 去除停用词和单字
    return filtered_words

def create_wordcloud(words, output_file, font_path='msyh.ttc'): # 默认使用微软雅黑字体
    text = ' '.join(words)
    try:
        wordcloud = WordCloud(
            font_path=font_path,  # 设置字体,显示中文
            background_color="white",  # 设置背景颜色
            width=800,  # 设置宽度
            height=600,  # 设置高度
            max_words=200,  # 设置最大显示的词语数量
            collocations=False # 避免词语重复组合
        ).generate(text)

        plt.figure(figsize=(10, 8), facecolor='white')
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig(output_file, dpi=300)  # 保存为图片
        plt.show()
        print(f"词云已保存到 {output_file}")

    except Exception as e:
        print(f"生成词云失败: {e}")


if __name__ == "__main__":
    # 1. 设置CSV文件路径
    csv_file_paths = [
        r'自己的文件路径',
        r'可填多个'
    ]

    # 2. 设置停用词文件路径
    stopword_file = r'停用词文件路径'  # 请替换成你的停用词文件路径,如果没有可以不设置

    # 3. 设置输出词云图片的文件路径
    output_wordcloud_file = r'输出图片文件路径\wordcloud.png'

    # 4. 读取CSV文件
    all_texts = read_csv_files(csv_file_paths)

    # 5. 分词
    words = segment_text(all_texts)

    # 6. 移除停用词
    if os.path.exists(stopword_file):
        filtered_words = remove_stopwords(words, stopword_file)
    else:
        filtered_words = words
        print("未找到停用词文件,跳过停用词移除步骤")

    # 7. 生成词云
    create_wordcloud(filtered_words, output_wordcloud_file)

效果

Image