#!/usr/bin/env python3
# Source: generate_config_template.py, Johntang666/WeChatDataAnalysis (GitHub, master branch)
"""
生成微信数据库字段配置模板
基于实际数据库结构生成JSON模板，供人工填写字段含义
"""

import json
import re
import sqlite3
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

class ConfigTemplateGenerator:
    """Build a fill-in-the-blanks JSON template from WeChat SQLite databases.

    The generator walks ``<databases_path>/<account>/*.db``, reads the column
    list of every table and writes a JSON document in which the ``meaning``
    and ``description`` entries are left empty for a human to complete.
    Tables that share a ``<prefix>_<hex hash>`` name and have an identical
    column layout are collapsed into a single ``<prefix>_*`` entry.
    """

    # A table-name suffix counts as a hash when it has at least this many
    # characters and consists solely of hexadecimal digits.
    _MIN_HASH_SUFFIX_LENGTH = 16
    _HEX_DIGITS = frozenset("0123456789abcdefABCDEF")

    # Matches the stem of a numbered message database (message_<n>.db).
    _NUMBERED_MESSAGE_DB = re.compile(r'message_(\d+)$')

    # File-name patterns of databases left out of the template, mapped to the
    # description that is printed next to each skipped file.
    _EXCLUDED_DB_PATTERNS = {
        r'biz_message_\d+\.db$': '企业微信聊天记录数据库',
        r'bizchat\.db$': '企业微信联系人数据库',
        r'contact_fts\.db$': '搜索联系人数据库',
        r'favorite_fts\.db$': '搜索收藏数据库'
    }

    def __init__(self, databases_path: str = "output/databases"):
        """Remember where the databases live.

        Args:
            databases_path: Directory that contains one sub-directory per
                account, each holding that account's ``*.db`` files.
        """
        self.databases_path = Path(databases_path)
        self.template_structure = {}

    def connect_database(self, db_path: Path) -> Optional[sqlite3.Connection]:
        """Open *db_path* with sqlite3.

        Returns:
            The open connection, or ``None`` (after printing the error) when
            SQLite refuses to open the file.
        """
        try:
            return sqlite3.connect(str(db_path))
        except sqlite3.Error as e:
            print(f"连接数据库失败 {db_path}: {e}")
            return None

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Return *name* as a double-quoted SQL identifier.

        Quoting lets names that contain dashes, spaces or reserved words be
        used in a statement; embedded double quotes are doubled.
        """
        return '"' + name.replace('"', '""') + '"'

    def _fetch_columns(self, cursor: sqlite3.Cursor, table_name: str) -> List[tuple]:
        """Return the ``PRAGMA table_info`` rows for *table_name*.

        Each row is ``(cid, name, type, notnull, dflt_value, pk)``.

        Raises:
            sqlite3.Error: If SQLite rejects the statement.
        """
        cursor.execute(f"PRAGMA table_info({self._quote_identifier(table_name)})")
        return cursor.fetchall()

    @staticmethod
    def _build_fields(columns: List[tuple]) -> Dict[str, Dict[str, str]]:
        """Turn ``PRAGMA table_info`` rows into the template's field mapping."""
        return {
            col[1]: {
                "type": col[2],
                "meaning": "",  # left empty for the user to fill in
                "notes": f"字段类型: {col[2]}"
            }
            for col in columns
        }

    def detect_similar_table_patterns(self, table_names: List[str]) -> Dict[str, List[str]]:
        """Group tables named ``<prefix>_<hex hash>`` by their prefix.

        The name is split at its first underscore; the remainder must be at
        least 16 characters long and purely hexadecimal to count as a hash.

        Returns:
            A mapping ``prefix -> table names`` that only contains prefixes
            shared by more than one table.
        """
        groups = defaultdict(list)

        for table_name in table_names:
            prefix, separator, suffix = table_name.partition('_')
            if not separator:
                continue
            if (len(suffix) >= self._MIN_HASH_SUFFIX_LENGTH
                    and self._HEX_DIGITS.issuperset(suffix)):
                groups[prefix].append(table_name)

        return {prefix: tables for prefix, tables in groups.items() if len(tables) > 1}

    def compare_table_structures(self, conn: sqlite3.Connection, table_names: List[str]) -> Dict[str, Any]:
        """Check whether all of *table_names* share the same column layout.

        Two tables are considered identical when their columns agree, in
        order, on name, upper-cased declared type, NOT NULL flag and
        primary-key position. Tables whose structure cannot be read are
        reported and left out of the comparison.

        Returns:
            ``{'are_identical': False, 'representative_table': None}`` when
            there is nothing to compare or the comparison fails; otherwise a
            dict with the keys ``are_identical``, ``representative_table``
            (the first readable table), ``structure`` (its normalised
            columns), ``table_count`` and ``table_names`` (the tables that
            were actually compared).
        """
        if not table_names:
            return {'are_identical': False, 'representative_table': None}

        try:
            cursor = conn.cursor()
            structures = {}

            for table_name in table_names:
                try:
                    columns = self._fetch_columns(cursor, table_name)
                except sqlite3.Error as e:
                    print(f"获取表结构失败 {table_name}: {e}")
                    continue

                # Normalise the rows so that only the compared attributes remain.
                structures[table_name] = [
                    {
                        'name': col[1],
                        'type': col[2].upper(),  # declared types are compared case-insensitively
                        'notnull': col[3],
                        'pk': col[5]
                    }
                    for col in columns
                ]

            if not structures:
                return {'are_identical': False, 'representative_table': None}

            first_table, first_structure = next(iter(structures.items()))
            # List equality covers both the column count and every column dict.
            are_identical = all(
                structure == first_structure for structure in structures.values()
            )

            return {
                'are_identical': are_identical,
                'representative_table': first_table,
                'structure': first_structure,
                'table_count': len(structures),
                'table_names': list(structures)
            }

        except Exception as e:
            # Best-effort boundary: a failed comparison means "not identical".
            print(f"比较表结构失败: {e}")
            return {'are_identical': False, 'representative_table': None}

    def analyze_database_structure(self, db_path: Path) -> Dict[str, Any]:
        """Describe every table of the database at *db_path*.

        Groups of hash-suffixed tables with an identical layout become one
        ``<prefix>_*`` entry of type ``similar_group``; every other table gets
        its own entry of type ``table``.

        Returns:
            A mapping ``table key -> description dict``, or ``{}`` when the
            database cannot be opened or analysed.
        """
        db_name = db_path.stem
        print(f"分析数据库结构: {db_name}")

        conn = self.connect_database(db_path)
        if conn is None:
            return {}

        try:
            cursor = conn.cursor()

            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            table_names = [row[0] for row in cursor.fetchall()]

            similar_patterns = self.detect_similar_table_patterns(table_names)
            processed_tables = set()
            db_structure = {}

            # First pass: collapse groups of structurally identical tables.
            for prefix, pattern_tables in similar_patterns.items():
                print(f"  检测到相似表模式 {prefix}_*: {len(pattern_tables)} 个表")

                comparison = self.compare_table_structures(conn, pattern_tables)
                if not comparison['are_identical']:
                    print("    → 表结构不同，保持独立处理")
                    continue

                representative_table = comparison['representative_table']
                print(f"    → 表结构完全相同，使用代表表: {representative_table}")

                try:
                    columns = self._fetch_columns(cursor, representative_table)
                except sqlite3.Error as e:
                    # Leave the group unprocessed so its tables are retried
                    # one by one below instead of losing the whole database.
                    print(f"    处理表 {representative_table} 失败: {e}")
                    continue

                db_structure[f"{prefix}_*"] = {
                    "type": "similar_group",
                    "pattern": f"{prefix}_{{hash}}",
                    "table_count": comparison['table_count'],
                    "representative_table": representative_table,
                    "description": "",  # left empty for the user to fill in
                    "fields": self._build_fields(columns)
                }

                # Only the tables that were really compared are covered by
                # the group entry; unreadable ones fall through to the
                # per-table pass.
                processed_tables.update(comparison['table_names'])

            # Second pass: every table not covered by a group entry.
            for table_name in table_names:
                if table_name in processed_tables:
                    continue

                try:
                    columns = self._fetch_columns(cursor, table_name)
                except sqlite3.Error as e:
                    print(f"    处理表 {table_name} 失败: {e}")
                    continue

                db_structure[table_name] = {
                    "type": "table",
                    "description": "",  # left empty for the user to fill in
                    "fields": self._build_fields(columns)
                }

            return db_structure

        except Exception as e:
            # Best-effort boundary: one broken database must not stop the run.
            print(f"分析数据库失败 {db_name}: {e}")
            return {}
        finally:
            conn.close()

    def _discover_database_files(self) -> List[Path]:
        """Return every ``*.db`` file found one level below ``databases_path``.

        Directories and files are visited in sorted order so that repeated
        runs produce the same result.
        """
        db_files = []
        for account_dir in sorted(self.databases_path.iterdir()):
            if account_dir.is_dir():
                db_files.extend(sorted(account_dir.glob("*.db")))
        return db_files

    @classmethod
    def _exclusion_reason(cls, db_filename: str) -> Optional[str]:
        """Return the description of the first exclusion pattern matching *db_filename*, else ``None``."""
        for pattern, description in cls._EXCLUDED_DB_PATTERNS.items():
            if re.match(pattern, db_filename):
                return description
        return None

    def _keep_single_message_database(self, db_files: List[Path]) -> List[Path]:
        """Reduce the numbered ``message_<n>.db`` files to a single one.

        When more than one numbered message database is present, only the
        second to last one (ordered by number) is kept and moved to the end
        of the list. Other ``message_*`` databases are left untouched.
        NOTE(review): the original author's comment says this choice mirrors
        the main analysis script - confirm against that script.
        """
        numbered = [db for db in db_files if self._NUMBERED_MESSAGE_DB.match(db.stem)]
        if len(numbered) <= 1:
            return db_files

        numbered.sort(key=lambda db: int(self._NUMBERED_MESSAGE_DB.match(db.stem).group(1)))
        selected_message_db = numbered[-2]
        print(f"检测到 {len(numbered)} 个message_{{数字}}.db数据库")
        print(f"选择倒数第二个: {selected_message_db.name}")

        remaining = [db for db in db_files if not self._NUMBERED_MESSAGE_DB.match(db.stem)]
        remaining.append(selected_message_db)
        return remaining

    @staticmethod
    def _print_statistics(template: Dict[str, Any]) -> None:
        """Print the table, similar-group and field totals of *template*."""
        all_tables = [
            table_info
            for db_info in template["databases"].values()
            for table_info in db_info["tables"].values()
        ]
        similar_groups = sum(
            1 for table_info in all_tables if table_info["type"] == "similar_group"
        )
        total_fields = sum(len(table_info["fields"]) for table_info in all_tables)

        print(f"  - 表数量: {len(all_tables)}")
        print(f"  - 相似表组: {similar_groups}")
        print(f"  - 字段总数: {total_fields}")

    def generate_template(self, output_file: str = "wechat_db_config_template.json"):
        """Analyse all databases and write the JSON template to *output_file*.

        Progress and summary statistics are printed along the way. Databases
        matching an exclusion pattern are skipped, and databases that yield
        no table information are left out of the result.

        Args:
            output_file: Path of the JSON file to (over)write.

        Raises:
            OSError: If ``databases_path`` cannot be listed or *output_file*
                cannot be written.
        """
        print("开始生成微信数据库配置模板...")

        all_db_files = self._discover_database_files()
        print(f"找到 {len(all_db_files)} 个数据库文件")

        # Split the files into those to analyse and those to skip.
        db_files = []
        excluded_files = []
        for db_file in all_db_files:
            description = self._exclusion_reason(db_file.name)
            if description is None:
                db_files.append(db_file)
            else:
                excluded_files.append((db_file, description))

        if excluded_files:
            print(f"\n排除以下数据库文件（{len(excluded_files)} 个）：")
            for excluded_file, description in excluded_files:
                print(f"  - {excluded_file.name} ({description})")

        print(f"\n实际处理 {len(db_files)} 个数据库文件")

        db_files = self._keep_single_message_database(db_files)
        print(f"实际分析 {len(db_files)} 个数据库文件")

        template = {
            "_metadata": {
                "description": "微信数据库字段配置模板",
                "version": "1.0",
                "instructions": {
                    "zh": "请为每个字段的 'meaning' 填入准确的中文含义，'description' 填入数据库/表的功能描述",
                    "en": "Please fill in accurate Chinese meanings for each field's 'meaning' and functional descriptions for 'description'"
                },
                "database_count": len(db_files),
                "generated_time": datetime.now().isoformat()
            },
            "databases": {}
        }

        for db_file in db_files:
            db_structure = self.analyze_database_structure(db_file)
            if db_structure:
                template["databases"][db_file.stem] = {
                    "description": "",  # left empty for the user to fill in
                    "file_size": db_file.stat().st_size,
                    "tables": db_structure
                }

        # Extra sections with example mappings for the user to extend.
        template["message_types"] = {
            "_instructions": "消息类型映射 - 格式: 'Type,SubType': '含义描述'",
            "examples": {
                "1,0": "文本消息",
                "3,0": "图片消息",
                "34,0": "语音消息"
            }
        }

        template["friend_types"] = {
            "_instructions": "好友类型映射 - 格式: 'TypeCode': '类型描述'",
            "examples": {
                "1": "好友",
                "2": "微信群",
                "3": "好友"
            }
        }

        output_path = Path(output_file)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(template, f, ensure_ascii=False, indent=2)

        print(f"\n配置模板生成完成: {output_file}")
        print(f"  - 数据库数量: {len(template['databases'])}")
        self._print_statistics(template)

        if excluded_files:
            print("\n生成完成统计：")
            print(f"  - 成功处理: {len(template['databases'])} 个数据库")
            print(f"  - 排除数据库: {len(excluded_files)} 个")
            print("  - 排除原因: 个人微信数据分析不需要企业微信和搜索索引数据")

        print(f"\n请编辑 {output_file} 文件，填入准确的字段含义和描述")

def main():
    """Print the program banner and write the template with default settings."""
    banner_rule = "=" * 50
    print("微信数据库配置模板生成器")
    print(banner_rule)

    # Default database directory and default output file name are used.
    ConfigTemplateGenerator().generate_template()

# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()