From 6ff6d9d2550a60dd4cda33d7644aa9ca2e6bf887 Mon Sep 17 00:00:00 2001
From: martinma-2000
Date: Wed, 17 Sep 2025 11:09:47 +0800
Subject: [PATCH 01/12] Add a multi-level -> single-level header logic class
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
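A minimal, self-contained sketch of the flattening this class performs
(hypothetical data; pandas exposes a two-row header as a MultiIndex of
tuples, and the processor joins the cleaned levels with the separator):

    import pandas as pd

    # pandas reads a two-row header as tuples such as ('存款', '余额');
    # placeholder cells come back as 'Unnamed: ...'.
    columns = pd.MultiIndex.from_tuples([
        ('机构', 'Unnamed: 0_level_1'),
        ('存款', '余额'),
        ('存款', '笔数'),
    ])
    df = pd.DataFrame([['网点A', 100, 3]], columns=columns)

    # Joining the non-noise levels with '_' yields the single-level names
    # the processor would produce: 机构, 存款_余额, 存款_笔数.
    flat = ['_'.join(p for p in col if not p.startswith('Unnamed')) for col in df.columns]
    print(flat)  # ['机构', '存款_余额', '存款_笔数']
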
 backend/test/excel_extract.py | 214 ++++++++++++++++++++++++++++++++++
 1 file changed, 214 insertions(+)
 create mode 100644 backend/test/excel_extract.py

diff --git a/backend/test/excel_extract.py b/backend/test/excel_extract.py
new file mode 100644
index 00000000..bda7004b
--- /dev/null
+++ b/backend/test/excel_extract.py
@@ -0,0 +1,214 @@
+import pandas as pd
+import re
+
+
+class ExcelHeaderProcessor:
+    """
+    Excel multi-level header processor.
+    Converts an Excel file with multi-level headers into one with a single-level header.
+    """
+
+    def __init__(self, separator="_"):
+        """
+        Initialize the processor.
+
+        Args:
+            separator: string used to join header levels, defaults to underscore
+        """
+        self.separator = separator
+
+    def detect_header_rows(self, file_path, sheet_name=0):
+        """
+        Automatically detect the number of header rows in an Excel file.
+        Assumes the first row is always a header and scans from the second row
+        onwards, treating the first row whose first and second columns are both
+        non-NaN as the start of the data.
+
+        Args:
+            file_path: path to the Excel file
+            sheet_name: sheet name or index, defaults to 0 (first sheet)
+
+        Returns:
+            header_rows: number of header rows
+        """
+        # Read the first 10 rows for analysis
+        df_preview = pd.read_excel(file_path, nrows=10, header=None, sheet_name=sheet_name)
+
+        # There is at least 1 header row (the first row)
+        header_rows = 1
+
+        # Scan from the second row (index 1)
+        for index in range(1, len(df_preview)):
+            row = df_preview.iloc[index]
+            first_column_value = row.iloc[0]   # value in the first column
+            second_column_value = row.iloc[1]  # value in the second column
+
+            # If both the first and second columns are non-NaN, the data rows start here
+            if pd.notna(first_column_value) and pd.notna(second_column_value):
+                header_rows = index  # header row count equals the data row's 0-based index
+                break
+        else:
+            # If no clear data row was found, treat at most the first 3 rows as headers
+            header_rows = min(3, len(df_preview))
+
+        return header_rows
+
+    def _clean_column_name(self, name):
+        """
+        Clean a column name by removing noise such as "Unnamed", "level" and bare numbers.
+
+        Args:
+            name: raw column name
+
+        Returns:
+            cleaned_name: cleaned column name, or None if nothing remains
+        """
+        if not isinstance(name, str):
+            name = str(name)
+
+        # Regular-expression patterns for the noise to strip
+        patterns_to_remove = [
+            r'Unnamed:?\s*\d*',  # matches "Unnamed" or "Unnamed: 0" etc.
+            r'level:?\s*\d*',    # matches "level" or "level 0" etc.
+            r'^\d+$',            # matches bare numbers
+            r'^\s*$',            # matches empty or whitespace-only strings
+        ]
+
+        cleaned_name = name
+        for pattern in patterns_to_remove:
+            cleaned_name = re.sub(pattern, '', cleaned_name, flags=re.IGNORECASE)
+
+        # Remove invalid "_<digits>" fragments
+        cleaned_name = re.sub(r'_\d+', '', cleaned_name)
+
+        # Strip leading/trailing whitespace
+        cleaned_name = cleaned_name.strip()
+
+        # Return None if nothing remains after cleaning
+        if not cleaned_name:
+            return None
+
+        return cleaned_name
+
+    def _is_meaningful_part(self, part):
+        """
+        Check whether a name part is meaningful (not a bare number or filler characters).
+
+        Args:
+            part: string part
+
+        Returns:
+            bool: whether the part is meaningful
+        """
+        if not part or not isinstance(part, str):
+            return False
+
+        # Strip leading/trailing whitespace
+        part = part.strip()
+
+        # An empty string is not meaningful
+        if not part:
+            return False
+
+        # A bare number is not meaningful
+        if part.isdigit():
+            return False
+
+        # Underscores alone are not meaningful
+        if re.match(r'^_+$', part):
+            return False
+
+        return True
+
+    def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name=0):
+        """
+        Convert an Excel file with multi-level headers to a single-level header.
+
+        Args:
+            file_path: path to the Excel file
+            header_rows: number of header rows; auto-detected if None
+            sheet_name: sheet name or index, defaults to 0 (first sheet)
+
+        Returns:
+            the processed DataFrame
+        """
+        # Auto-detect the header row count if not specified
+        if header_rows is None:
+            header_rows = self.detect_header_rows(file_path, sheet_name)
+            print(f"Detected header rows: {header_rows}")
+
+        # Keep at least 1 header row if detection returned 0
+        if header_rows == 0:
+            header_rows = 1
+            print("Detected 0 header rows; adjusted to 1")
+
+        # Read the header portion of the Excel file
+        header_df = pd.read_excel(file_path, header=list(range(header_rows)),
+                                  sheet_name=sheet_name)
+
+        # Get the (multi-level) column index
+        multi_columns = header_df.columns
+
+        # Convert the multi-level header to a single-level header
+        single_columns = []
+        for col in multi_columns:
+            # Process each level of the multi-level header
+            parts = []
+            for level in col:
+                if pd.notna(level):
+                    # Clean the name at each level
+                    cleaned_level = self._clean_column_name(level)
+                    if cleaned_level:  # keep only non-empty cleaned names
+                        parts.append(cleaned_level)
+
+            # Filter out meaningless parts
+            meaningful_parts = [part for part in parts if self._is_meaningful_part(part)]
+
+            # Join the levels with the separator
+            if not meaningful_parts:
+                # If everything was cleaned away, fall back to a default column name
+                single_columns.append(f"column_{len(single_columns)}")
+            else:
+                # Join the parts and make sure there are no stray separators
+                column_name = self.separator.join(meaningful_parts)
+                # Collapse runs of consecutive separators into one
+                column_name = re.sub(f'{re.escape(self.separator)}+', self.separator, column_name)
+                column_name = column_name.strip(self.separator)  # strip leading/trailing separators
+
+                # Clean invalid "_<digits>" fragments once more
+                column_name = re.sub(r'_\d+', '', column_name)
+
+                # Fall back to a default column name if nothing meaningful remains
+                if not column_name or not self._is_meaningful_part(column_name):
+                    single_columns.append(f"column_{len(single_columns)}")
+                else:
+                    single_columns.append(column_name)
+
+        # Re-read the whole Excel file
+        df = pd.read_excel(file_path, header=list(range(header_rows)), sheet_name=sheet_name)
+
+        # Apply the new single-level header
+        df.columns = single_columns
+
+        return df
+
+
+# Usage example
+if __name__ == "__main__":
+    excel_file = r"D:\文档-陕农信\测试文件示例\报表1 - 副本 (2).xlsx"
+
+    print("Method 1: class-based processing")
+    try:
+        # Create a processor instance
+        processor = ExcelHeaderProcessor(separator="_")
+
+        # Process the Excel file
+        df = processor.convert_multi_to_single_header(excel_file, header_rows=None)
+
+        print("Processed data:")
+        print(df.head())
+
+    except Exception as e:
+        print(f"Error while processing the file: {e}")
+        import traceback
+        traceback.print_exc()

From f86755ee70a767eb19a17019c8ba18cce3bff51b Mon Sep 17 00:00:00 2001
From: martinma-2000
Date: Wed, 17 Sep 2025 16:37:06 +0800
Subject: [PATCH 02/12] Handle sheets whose first two rows hold the table name and a date line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
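The sheet layout this patch assumes (hypothetical example): row 1 holds the
table name as the only non-empty cell, row 2 holds a cell such as
"日期:2025年9月", and the real headers start at row 3. Read with the default
header row, the name becomes the only non-"Unnamed" column and the date row
becomes the first data row, which is what get_name_time exploits:

    import pandas as pd

    df = pd.read_excel('report.xlsx', nrows=1)  # 'report.xlsx' is hypothetical
    name = [c for c in df.columns if not c.startswith('Unnamed')][0]
    date = [v for v in df.values[0] if pd.notna(v) and '日期' in str(v)][0]
    print(name, date.split(':')[-1])  # e.g. 报表1 2025年9月
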
 backend/test/excel_extract.py | 52 +++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 17 deletions(-)

diff --git a/backend/test/excel_extract.py b/backend/test/excel_extract.py
index bda7004b..ce7f0b52 100644
--- a/backend/test/excel_extract.py
+++ b/backend/test/excel_extract.py
@@ -17,6 +17,24 @@ def __init__(self, separator="_"):
         """
         self.separator = separator
 
+    def get_name_time(self, file_path, sheet_name=0):
+        """
+        Read the table name and the date from the first two rows of the Excel file.
+
+        Args:
+            file_path: path to the Excel file
+            sheet_name: sheet name or index, defaults to 0
+
+        Returns:
+            string: table name
+            string: table date
+        """
+        df = pd.read_excel(file_path, nrows=1, sheet_name=sheet_name)
+        excel_name = [col for col in df.columns if not col.startswith('Unnamed')]
+        excel_time = [data for data in df.values[0] if pd.notna(data) and '日期' in str(data)]
+        excel_time = excel_time[0].split(':')[-1]
+        return excel_name[0], excel_time
+
     def detect_header_rows(self, file_path, sheet_name=0):
         """
         Automatically detect the number of header rows in an Excel file.
@@ -31,8 +49,7 @@ def detect_header_rows(self, file_path, sheet_name=0):
             header_rows: number of header rows
         """
         # Read the first 10 rows for analysis
-        df_preview = pd.read_excel(file_path, nrows=10, header=None, sheet_name=sheet_name)
-
+        df_preview = pd.read_excel(file_path, nrows=10, header=None, sheet_name=sheet_name, skiprows=[0, 1])
         # There is at least 1 header row (the first row)
         header_rows = 1
 
@@ -40,10 +57,9 @@ def detect_header_rows(self, file_path, sheet_name=0):
         for index in range(1, len(df_preview)):
             row = df_preview.iloc[index]
             first_column_value = row.iloc[0]   # value in the first column
-            second_column_value = row.iloc[1]  # value in the second column
-
-            # If both the first and second columns are non-NaN, the data rows start here
-            if pd.notna(first_column_value) and pd.notna(second_column_value):
+
+            # If the first column is non-NaN, the data rows start here
+            if pd.notna(first_column_value):
                 header_rows = index  # header row count equals the data row's 0-based index
                 break
         else:
@@ -140,14 +156,13 @@ def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name
         # Keep at least 1 header row if detection returned 0
         if header_rows == 0:
             header_rows = 1
-            print("Detected 0 header rows; adjusted to 1")
-
+            # print("Detected 0 header rows; adjusted to 1")
+        adjusted_header_rows = [i + 2 for i in range(header_rows)]  # skip the first two rows
         # Read the header portion of the Excel file
-        header_df = pd.read_excel(file_path, header=list(range(header_rows)),
-                                  sheet_name=sheet_name)
+        header_df = pd.read_excel(file_path, header=adjusted_header_rows, sheet_name=sheet_name)
         # Get the (multi-level) column index
         multi_columns = header_df.columns
-
+
         # Convert the multi-level header to a single-level header
         single_columns = []
         for col in multi_columns:
@@ -184,7 +199,7 @@ def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name
                     single_columns.append(column_name)
 
         # Re-read the whole Excel file
-        df = pd.read_excel(file_path, header=list(range(header_rows)), sheet_name=sheet_name)
+        df = pd.read_excel(file_path, header=list(range(header_rows)), sheet_name=sheet_name, skiprows=[0, 1])
 
         # Apply the new single-level header
         df.columns = single_columns
@@ -196,16 +211,19 @@ def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name
 if __name__ == "__main__":
     excel_file = r"D:\文档-陕农信\测试文件示例\报表1 - 副本 (2).xlsx"
 
-    print("Method 1: class-based processing")
     try:
         # Create a processor instance
         processor = ExcelHeaderProcessor(separator="_")
-
+
+        excel_name, _time = processor.get_name_time(excel_file)
+
         # Process the Excel file
         df = processor.convert_multi_to_single_header(excel_file, header_rows=None)
-
-        print("Processed data:")
-        print(df.head())
+        df['表格日期'] = _time
+        print(df.columns)
+        for i in df.values:
+            print(i)
+        df.to_excel(excel_name + "_单级表头.xlsx", index=False)
 
     except Exception as e:
         print(f"Error while processing the file: {e}")

From 25ef0236243f85387269c6057f74d82e4cefed6f Mon Sep 17 00:00:00 2001
From: martinma-2000
Date: Wed, 17 Sep 2025 16:39:16 +0800
Subject: [PATCH 03/12] Handle sheets whose first two rows hold the table name and a date line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/test/excel_extract.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/test/excel_extract.py b/backend/test/excel_extract.py
index ce7f0b52..b131ad35 100644
--- a/backend/test/excel_extract.py
+++ b/backend/test/excel_extract.py
@@ -220,7 +220,7 @@ def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name
         # Process the Excel file
         df = processor.convert_multi_to_single_header(excel_file, header_rows=None)
         df['表格日期'] = _time
-        print(df.columns)
+
         for i in df.values:
             print(i)
         df.to_excel(excel_name + "_单级表头.xlsx", index=False)

From 1d31923dbd5f2971c0298d18d429c0c495f29420 Mon Sep 17 00:00:00 2001
From: martinma-2000
Date: Wed, 17 Sep 2025 17:56:04 +0800
Subject: [PATCH 04/12] Strip invalid trailing rows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/test/excel_extract.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/backend/test/excel_extract.py b/backend/test/excel_extract.py
index b131ad35..8ec37a88 100644
--- a/backend/test/excel_extract.py
+++ b/backend/test/excel_extract.py
@@ -209,7 +209,7 @@ def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name
 
 # Usage example
 if __name__ == "__main__":
-    excel_file = r"D:\文档-陕农信\测试文件示例\报表1 - 副本 (2).xlsx"
+    excel_file = r"D:\文档-陕农信\测试文件示例\报表1 - 副本.xlsx"
 
     try:
         # Create a processor instance
@@ -219,6 +219,8 @@ def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name
 
         # Process the Excel file
         df = processor.convert_multi_to_single_header(excel_file, header_rows=None)
+        if len(df) > 2:
+            df = df.iloc[:-2]  # drop the last two rows
         df['表格日期'] = _time
 
         for i in df.values:

From 95a5d288296b5c50deeda60b3577d91e370f32d7 Mon Sep 17 00:00:00 2001
From: martinma-2000
Date: Thu, 18 Sep 2025 16:17:04 +0800
Subject: [PATCH 05/12] 1. Switch the port to 8010  2. Rename columns; keep original names as comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
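The letter scheme introduced here maps index 0-25 to A-Z and larger indexes
to two letters. A quick boundary check of the helper (copied from the diff;
it covers up to 702 columns, after which a third letter would be needed):

    def get_column_name(index):
        if index < 26:
            return chr(ord('A') + index)
        else:
            return chr(ord('A') + index // 26 - 1) + chr(ord('A') + index % 26)

    print([get_column_name(i) for i in (0, 25, 26, 27, 51, 52, 701)])
    # ['A', 'Z', 'AA', 'AB', 'AZ', 'BA', 'ZZ']
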
";" + session.execute(text(sql)) + session.execute(text(comment_sql)) + print(sql+comment_sql) session.commit() From 5a681e99115fd588ee3efe031b193a0026013163 Mon Sep 17 00:00:00 2001 From: martinma-2000 Date: Thu, 18 Sep 2025 16:17:18 +0800 Subject: [PATCH 06/12] =?UTF-8?q?1=E3=80=81=E6=9B=B4=E6=8D=A2=E7=AB=AF?= =?UTF-8?q?=E5=8F=A38010=202=E3=80=81=E5=88=97=E5=90=8D=E6=9B=B4=E6=94=B9?= =?UTF-8?q?=EF=BC=8C=E5=8E=9F=E5=88=97=E5=90=8D=E5=8F=98=E6=88=90comment?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- backend/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 98e00c88..a644c41f 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ docker run -d \ ### 访问方式 -- 在浏览器中打开: http://<你的服务器IP>:8000/ +- 在浏览器中打开: http://<你的服务器IP>:8010/ - 用户名: admin - 密码: SQLBot@123456 diff --git a/backend/main.py b/backend/main.py index 8912ef1e..2cd526f9 100644 --- a/backend/main.py +++ b/backend/main.py @@ -97,5 +97,5 @@ def custom_generate_unique_id(route: APIRoute) -> str: if __name__ == "__main__": import uvicorn - uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True) + uvicorn.run("main:app", host="0.0.0.0", port=8010) # uvicorn.run("main:mcp_app", host="0.0.0.0", port=8001) # mcp server From 8b3fb4f06196b156db2023c6cc25b9a753fcf966 Mon Sep 17 00:00:00 2001 From: martinma-2000 Date: Fri, 19 Sep 2025 09:12:13 +0800 Subject: [PATCH 07/12] =?UTF-8?q?=E6=97=B6=E9=97=B4=E5=88=97=E5=85=83?= =?UTF-8?q?=E7=B4=A0=E6=A0=BC=E5=BC=8F=E8=BD=AC=E6=8D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/test/excel_extract.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/backend/test/excel_extract.py b/backend/test/excel_extract.py index 8ec37a88..3b9e07d7 100644 --- a/backend/test/excel_extract.py +++ b/backend/test/excel_extract.py @@ -206,10 +206,21 @@ def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name return df + def parse_chinese_date(self,date_str): + try: + # 尝试按照 "YYYY年M月" 格式解析 + formatted_str = date_str.replace('年', '-').replace('月', '') + return pd.Period(formatted_str, freq='M') + except Exception as e: + print(f"无法解析日期字符串: {date_str}, 错误: {e}") + return pd.NaT # 返回 Not a Time 表示无效时间 + + + # 使用示例 if __name__ == "__main__": - excel_file = r"D:\文档-陕农信\测试文件示例\报表1 - 副本.xlsx" + excel_file = r"D:\文档-陕农信\测试文件示例\27000099_202509_元_银行卡业务月度运营报表.xlsx" try: # 创建处理器实例 @@ -221,7 +232,8 @@ def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name df = processor.convert_multi_to_single_header(excel_file, header_rows=None) if len(df) > 2: df = df.iloc[:-2] # 删除最后两行 - df['表格日期'] = _time + + df['表格日期'] = processor.parse_chinese_date(_time) for i in df.values: print(i) From 53a96d40308284f2c8cac5d80bb5056c34998689 Mon Sep 17 00:00:00 2001 From: martinma-2000 Date: Fri, 19 Sep 2025 10:57:19 +0800 Subject: [PATCH 08/12] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=9C=80=E7=BB=88df?= =?UTF-8?q?=E8=BE=93=E5=87=BA=E7=9A=84=E6=A0=BC=E5=BC=8F=E5=86=85=E5=AE=B9?= =?UTF-8?q?=EF=BC=8C=E6=96=B0=E5=A2=9E=E6=97=B6=E9=97=B4=E5=88=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/test/excel_extract.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/backend/test/excel_extract.py b/backend/test/excel_extract.py index 3b9e07d7..66935708 100644 --- a/backend/test/excel_extract.py +++ b/backend/test/excel_extract.py @@ -1,5 +1,6 @@ 
 backend/test/excel_extract.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/backend/test/excel_extract.py b/backend/test/excel_extract.py
index 8ec37a88..3b9e07d7 100644
--- a/backend/test/excel_extract.py
+++ b/backend/test/excel_extract.py
@@ -206,10 +206,21 @@ def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name
 
         return df
 
+    def parse_chinese_date(self, date_str):
+        try:
+            # Try to parse the "YYYY年M月" format
+            formatted_str = date_str.replace('年', '-').replace('月', '')
+            return pd.Period(formatted_str, freq='M')
+        except Exception as e:
+            print(f"Could not parse date string: {date_str}, error: {e}")
+            return pd.NaT  # Not a Time marks an invalid date
+
+
+
 # Usage example
 if __name__ == "__main__":
-    excel_file = r"D:\文档-陕农信\测试文件示例\报表1 - 副本.xlsx"
+    excel_file = r"D:\文档-陕农信\测试文件示例\27000099_202509_元_银行卡业务月度运营报表.xlsx"
 
     try:
         # Create a processor instance
@@ -219,7 +232,8 @@ def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name
         df = processor.convert_multi_to_single_header(excel_file, header_rows=None)
         if len(df) > 2:
             df = df.iloc[:-2]  # drop the last two rows
-        df['表格日期'] = _time
+
+        df['表格日期'] = processor.parse_chinese_date(_time)
 
         for i in df.values:
             print(i)

From 53a96d40308284f2c8cac5d80bb5056c34998689 Mon Sep 17 00:00:00 2001
From: martinma-2000
Date: Fri, 19 Sep 2025 10:57:19 +0800
Subject: [PATCH 08/12] Polish the final DataFrame output and add the date column
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/test/excel_extract.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/backend/test/excel_extract.py b/backend/test/excel_extract.py
index 3b9e07d7..66935708 100644
--- a/backend/test/excel_extract.py
+++ b/backend/test/excel_extract.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import re
+import merge_diff_time
 
 
 class ExcelHeaderProcessor:
@@ -203,6 +204,11 @@ def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name
 
         # Apply the new single-level header
         df.columns = single_columns
+
+        excel_name, _time = processor.get_name_time(file_path)
+        if len(df) > 2:
+            df = df.iloc[:-2]  # drop the last two rows
+        df['表格日期'] = processor.parse_chinese_date(_time)
 
         return df

From 07c4ada66df3463089b354f9a95012d459b1aa83 Mon Sep 17 00:00:00 2001
From: martinma-2000
Date: Fri, 19 Sep 2025 10:59:46 +0800
Subject: [PATCH 09/12] Add the vertical merge helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
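A small usage sketch of the helper (hypothetical frames; it requires
identical column names and identical key values in the primary-key column,
then stacks the rows):

    import pandas as pd
    from test.merge_diff_time import concatenate_dataframes

    jan = pd.DataFrame({'机构': ['A', 'B'], '余额': [1, 2], '表格日期': '2025-01'})
    feb = pd.DataFrame({'机构': ['A', 'B'], '余额': [3, 4], '表格日期': '2025-02'})
    merged = concatenate_dataframes([jan, feb], primary_key_col=0)
    print(len(merged))  # 4 -- one row per institution per month
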
 backend/test/merge_diff_time.py | 41 +++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 backend/test/merge_diff_time.py

diff --git a/backend/test/merge_diff_time.py b/backend/test/merge_diff_time.py
new file mode 100644
index 00000000..8a49a700
--- /dev/null
+++ b/backend/test/merge_diff_time.py
@@ -0,0 +1,41 @@
+"""
+Purpose: vertically concatenate several tables that share the same layout but cover different dates.
+"""
+import pandas as pd
+from typing import List
+
+
+def concatenate_dataframes(dataframes: List[pd.DataFrame], primary_key_col: int = 0) -> pd.DataFrame:
+    if not dataframes:
+        raise ValueError("The input DataFrame list must not be empty")
+    # Validate that the column names are identical everywhere
+    reference_columns = list(dataframes[0].columns)
+    for i, df in enumerate(dataframes[1:], 1):
+        if list(df.columns) != reference_columns:
+            raise ValueError(f"DataFrame {i + 1} has column names that differ from the first one\n"
+                             f"Expected columns: {reference_columns}\n"
+                             f"Actual columns: {list(df.columns)}")
+
+    if len(dataframes) > 1:
+        # Collect the (deduplicated) primary-key values
+        primary_keys = set()
+        for df in dataframes:
+            # Check that the primary-key column exists
+            if primary_key_col >= len(df.columns):
+                raise ValueError(f"Primary-key column index {primary_key_col} is out of range for a DataFrame with {len(df.columns)} columns")
+
+            current_keys = set(df.iloc[:, primary_key_col].astype(str))
+            if not primary_keys:
+                primary_keys = current_keys
+            # otherwise check that the keys are consistent
+            elif current_keys != primary_keys:
+                diff = primary_keys.symmetric_difference(current_keys)
+                raise ValueError(f"Primary-key values differ, difference: {diff}\n"
+                                 f"Make sure column {primary_key_col + 1} holds identical key values in every DataFrame")
+    try:
+        result = pd.concat(dataframes, axis=0, ignore_index=True)
+    except Exception as e:
+        raise ValueError(f"Error during concatenation: {str(e)}")
+    return result
+
+
+if __name__ == "__main__":
+    excel_file = r"D:\文档-陕农信\测试文件示例\27000099_202509_元_银行卡业务月度运营报表.xlsx"

From 7856b7081c620b503cdfc0f573f47630ba56ba0b Mon Sep 17 00:00:00 2001
From: martinma-2000
Date: Fri, 19 Sep 2025 16:51:08 +0800
Subject: [PATCH 10/12] Use self for internal method calls; import the merge helper via its full module path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/test/excel_extract.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/test/excel_extract.py b/backend/test/excel_extract.py
index 66935708..f3038bd8 100644
--- a/backend/test/excel_extract.py
+++ b/backend/test/excel_extract.py
@@ -1,6 +1,6 @@
 import pandas as pd
 import re
-import merge_diff_time
+from test.merge_diff_time import concatenate_dataframes
 
 
 class ExcelHeaderProcessor:
@@ -205,10 +205,10 @@ def convert_multi_to_single_header(self, file_path, header_rows=None, sheet_name
 
         # Apply the new single-level header
         df.columns = single_columns
 
-        excel_name, _time = processor.get_name_time(file_path)
+        excel_name, _time = self.get_name_time(file_path)
         if len(df) > 2:
             df = df.iloc[:-2]  # drop the last two rows
-        df['表格日期'] = processor.parse_chinese_date(_time)
+        df['表格日期'] = self.parse_chinese_date(_time)
 
         return df

From dd6bb37008d6cdbee7d0302847ded636ff26ffea Mon Sep 17 00:00:00 2001
From: martinma-2000
Date: Fri, 19 Sep 2025 17:52:41 +0800
Subject: [PATCH 11/12] Add single-table preprocessing and multi-table preprocessing + concatenation endpoints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
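A hedged client-side sketch of the two endpoints (the router's mount prefix
is not visible in this diff, so the '/datasource' base below is an
assumption; the port follows the 8010 change from patch 06):

    import requests

    base = 'http://localhost:8010/datasource'  # assumed mount point

    # Single-table preprocessing
    with open('report.xlsx', 'rb') as f:  # hypothetical file
        r = requests.post(f'{base}/preprocessExcel',
                          files={'file': f}, data={'separator': '_'})
    print(r.json())  # {'filename': ..., 'sheets': [...]}

    # Multi-table preprocessing + concatenation; the response body is the merged workbook
    with open('jan.xlsx', 'rb') as f1, open('feb.xlsx', 'rb') as f2:
        r = requests.post(f'{base}/concatenateExcels',
                          files=[('files', f1), ('files', f2)],
                          data={'primary_key_col': '0'})
    with open('merged.xlsx', 'wb') as out:
        out.write(r.content)
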
 backend/apps/datasource/api/datasource.py | 115 ++++++++++++++++++++++
 1 file changed, 115 insertions(+)

diff --git a/backend/apps/datasource/api/datasource.py b/backend/apps/datasource/api/datasource.py
index 9ca07e32..9c25e5ab 100644
--- a/backend/apps/datasource/api/datasource.py
+++ b/backend/apps/datasource/api/datasource.py
@@ -301,6 +301,121 @@ def inner():
     return await asyncio.to_thread(inner)
 
 
+@router.post("/preprocessExcel", response_model=PreprocessResponse)
+async def preprocess_excel(
+        file: UploadFile = File(...),
+        separator: str = Form("_")
+):
+    """
+    Preprocess an Excel file, converting its multi-level header to a single level.
+
+    Args:
+        file: the uploaded Excel file
+        separator: join character, defaults to underscore
+
+    Returns:
+        PreprocessResponse: information about the preprocessed file
+    """
+    ALLOWED_EXTENSIONS = {"xlsx", "xls"}
+    if not file.filename.lower().endswith(tuple(ALLOWED_EXTENSIONS)):
+        raise HTTPException(400, "Only support .xlsx/.xls")
+
+    os.makedirs(path, exist_ok=True)
+    filename = f"{file.filename.split('.')[0]}_preprocessed_{hashlib.sha256(uuid.uuid4().bytes).hexdigest()[:10]}.{file.filename.split('.')[1]}"
+    save_path = os.path.join(path, filename)
+    with open(save_path, "wb") as f:
+        f.write(await file.read())
+
+    try:
+        # Create a processor instance
+        processor = ExcelHeaderProcessor(separator=separator)
+
+        # Process the Excel file: convert the multi-level header to a single level
+        df = processor.convert_multi_to_single_header(save_path)
+
+        # Save the processed file
+        processed_filename = save_path.replace('.' + file.filename.split('.')[-1], '_processed.xlsx')
+        df.to_excel(processed_filename, index=False)
+
+        # Return information about the processed file
+        sheets = [{"tableName": "Sheet1", "tableComment": "Processed Sheet"}]
+        return PreprocessResponse(filename=os.path.basename(processed_filename), sheets=sheets)
+
+    except Exception as e:
+        # Remove the temporary file
+        if os.path.exists(save_path):
+            os.remove(save_path)
+        raise HTTPException(500, f"Error while preprocessing the file: {str(e)}")
+
+
+from fastapi.responses import FileResponse
+
+
+@router.post("/concatenateExcels")
+async def concatenate_excels(
+        files: List[UploadFile] = File(...),
+        separator: str = Form("_"),
+        primary_key_col: int = Form(0)
+):
+    """
+    Concatenate several Excel files.
+
+    Args:
+        files: the uploaded Excel files
+        separator: join character, defaults to underscore
+        primary_key_col: index of the primary-key column, defaults to 0
+
+    Returns:
+        the concatenated workbook as a file download
+    """
+    ALLOWED_EXTENSIONS = {"xlsx", "xls"}
+
+    # Check the file types
+    for file in files:
+        if not file.filename.lower().endswith(tuple(ALLOWED_EXTENSIONS)):
+            raise HTTPException(400, "Only support .xlsx/.xls")
+
+    # Save the uploaded files
+    file_paths = []
+    for file in files:
+        filename = f"{file.filename.split('.')[0]}_{hashlib.sha256(uuid.uuid4().bytes).hexdigest()[:10]}.{file.filename.split('.')[1]}"
+        save_path = os.path.join(path, filename)
+        with open(save_path, "wb") as f:
+            f.write(await file.read())
+        file_paths.append(save_path)
+
+    try:
+        # Create a processor instance
+        processor = ExcelHeaderProcessor(separator=separator)
+
+        # Preprocess every file
+        dataframes = []
+        for file_path in file_paths:
+            df = processor.convert_multi_to_single_header(file_path)
+            dataframes.append(df)
+
+        # Concatenate all DataFrames
+        result_df = concatenate_dataframes(dataframes, primary_key_col)
+
+        # Save the concatenated file
+        concatenated_filename = f"concatenated_{hashlib.sha256(uuid.uuid4().bytes).hexdigest()[:10]}.xlsx"
+        concatenated_path = os.path.join(path, concatenated_filename)
+        result_df.to_excel(concatenated_path, index=False)
+
+        # Return information about the concatenated file
+        # sheets = [{"tableName": "Sheet1", "tableComment": "Concatenated Sheet"}]
+        # return PreprocessResponse(filename=concatenated_filename, sheets=sheets)
+        # Return the file for download
+        return FileResponse(
+            path=concatenated_path,
+            filename=concatenated_filename,
+            media_type='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+        )
+    except Exception as e:
+        # Remove the temporary files
+        for file_path in file_paths:
+            if os.path.exists(file_path):
+                os.remove(file_path)
+        raise HTTPException(500, f"Error while concatenating the files: {str(e)}")
+
+
 def insert_pg(df, tableName, engine):
     # fix field type
     for i in range(len(df.dtypes)):

From ea879bc4c690c4d356ffa34009f22a9488d9b6c4 Mon Sep 17 00:00:00 2001
From: martinma-2000
Date: Fri, 19 Sep 2025 17:53:32 +0800
Subject: [PATCH 12/12] Add single-table preprocessing and multi-table preprocessing + concatenation endpoints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/apps/datasource/api/datasource.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/backend/apps/datasource/api/datasource.py b/backend/apps/datasource/api/datasource.py
index 9c25e5ab..097bbd66 100644
--- a/backend/apps/datasource/api/datasource.py
+++ b/backend/apps/datasource/api/datasource.py
@@ -8,7 +8,12 @@
 import orjson
 import pandas as pd
-from fastapi import APIRouter, File, UploadFile, HTTPException
+from fastapi import APIRouter, File, UploadFile, HTTPException, Form
+from pydantic import BaseModel
+
+from test.excel_extract import ExcelHeaderProcessor
+from test.merge_diff_time import concatenate_dataframes
+
 from apps.db.db import get_schema
 from apps.db.engine import get_engine_conn
@@ -26,6 +31,18 @@
 path = settings.EXCEL_PATH
 
 
+class PreprocessResponse(BaseModel):
+    """Response model for the preprocessing endpoint"""
+    filename: str
+    sheets: List[dict]
+
+
+class ConcatenateRequest(BaseModel):
+    """Request model for the concatenation endpoint"""
+    file_paths: List[str]
+    sheet_names: List[str] = None
+
+
 @router.get("/ws/{oid}", include_in_schema=False)
 async def query_by_oid(session: SessionDep, user: CurrentUser, oid: int) -> List[CoreDatasource]:
     if not user.isAdmin:
@@ -223,7 +240,7 @@ def inner():
 #         column_len = len(df.dtypes)
 #         fields = []
 #         for i in range(column_len):
-#             # build fields 
+#             # build fields
 #             fields.append({"name": df.columns[i], "type": str(df.dtypes[i]), "relType": ""})
 #         # create table
 #         create_table(conn, tableName, fields)
@@ -468,4 +485,4 @@ def get_column_name(index):
         raise HTTPException(400, str(e))
     finally:
         cursor.close()
-        conn.close()
+        conn.close()
\ No newline at end of file