Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions backend/apps/chat/api/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from apps.chat.models.chat_model import CreateChat, ChatRecord, RenameChat, ChatQuestion, AxisObj
from apps.chat.task.llm import LLMService
from common.core.deps import CurrentAssistant, SessionDep, CurrentUser, Trans
from common.utils.data_format import DataFormat

router = APIRouter(tags=["Data Q&A"], prefix="/chat")

Expand Down Expand Up @@ -245,9 +246,9 @@ async def export_excel(session: SessionDep, chat_record_id: int, trans: Trans):

def inner():

data_list = LLMService.convert_large_numbers_in_object_array(_data + _predict_data)
data_list = DataFormat.convert_large_numbers_in_object_array(_data + _predict_data)

md_data, _fields_list = LLMService.convert_object_array_for_pandas(fields, data_list)
md_data, _fields_list = DataFormat.convert_object_array_for_pandas(fields, data_list)

# data, _fields_list, col_formats = LLMService.format_pd_data(fields, _data + _predict_data)

Expand Down
124 changes: 6 additions & 118 deletions backend/apps/chat/task/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
from common.core.db import engine
from common.core.deps import CurrentAssistant, CurrentUser
from common.error import SingleMessageError, SQLBotDBError, ParseSQLResultError, SQLBotDBConnectionError
from common.utils.data_format import DataFormat
from common.utils.utils import SQLBotLogUtil, extract_nested_json, prepare_for_orjson

warnings.filterwarnings("ignore")
Expand Down Expand Up @@ -1039,7 +1040,7 @@ def run_task(self, in_chat: bool = True, stream: bool = True,

result = self.execute_sql(sql=real_execute_sql)

_data = self.convert_large_numbers_in_object_array(result.get('data'))
_data = DataFormat.convert_large_numbers_in_object_array(result.get('data'))
result["data"] = _data

self.save_sql_data(session=_session, data_obj=result)
Expand All @@ -1057,15 +1058,15 @@ def run_task(self, in_chat: bool = True, stream: bool = True,
for field in result.get('fields'):
_column_list.append(AxisObj(name=field, value=field))

md_data, _fields_list = self.convert_object_array_for_pandas(_column_list, result.get('data'))
md_data, _fields_list = DataFormat.convert_object_array_for_pandas(_column_list, result.get('data'))

# data, _fields_list, col_formats = self.format_pd_data(_column_list, result.get('data'))

if not _data or not _fields_list:
yield 'The SQL execution result is empty.\n\n'
else:
df = pd.DataFrame(_data, columns=_fields_list)
df_safe = self.safe_convert_to_string(df)
df_safe = DataFormat.safe_convert_to_string(df)
markdown_table = df_safe.to_markdown(index=False)
yield markdown_table + '\n\n'
else:
Expand Down Expand Up @@ -1115,15 +1116,15 @@ def run_task(self, in_chat: bool = True, stream: bool = True,
_column_list.append(
AxisObj(name=field if not _fields.get(field) else _fields.get(field), value=field))

md_data, _fields_list = self.convert_object_array_for_pandas(_column_list, result.get('data'))
md_data, _fields_list = DataFormat.convert_object_array_for_pandas(_column_list, result.get('data'))

# data, _fields_list, col_formats = self.format_pd_data(_column_list, result.get('data'))

if not md_data or not _fields_list:
yield 'The SQL execution result is empty.\n\n'
else:
df = pd.DataFrame(md_data, columns=_fields_list)
df_safe = self.safe_convert_to_string(df)
df_safe = DataFormat.safe_convert_to_string(df)
markdown_table = df_safe.to_markdown(index=False)
yield markdown_table + '\n\n'

Expand Down Expand Up @@ -1176,120 +1177,7 @@ def run_task(self, in_chat: bool = True, stream: bool = True,
self.finish(_session)
session_maker.remove()

@staticmethod
def safe_convert_to_string(df):
    """Return a copy of ``df`` in which every cell is rendered as text.

    Missing scalars (``None``, ``NaN``, ``NaT`` ...) become ``""``. Every
    other cell becomes a zero-width space (U+200B) followed by ``str(cell)``.
    NOTE(review): the zero-width prefix presumably keeps renderers from
    re-interpreting the text as a number -- confirm with the consumers.

    The input frame is not modified.
    """

    def format_value(x):
        # pd.isna() is element-wise on list-likes (e.g. array/JSON cells), and
        # the resulting array has no single truth value, so only scalars are
        # tested for missingness.
        if pd.api.types.is_scalar(x) and pd.isna(x):
            return ""
        return "\u200b" + str(x)

    df_copy = df.copy()
    for col in df_copy.columns:
        df_copy[col] = df_copy[col].apply(format_value)
    return df_copy

@staticmethod
def convert_large_numbers_in_object_array(obj_array, int_threshold=1e15, float_threshold=1e10):
    """Return a new list with very large / very small numbers turned into strings.

    For every dict in ``obj_array``:

    * an ``int`` whose magnitude is ``>= int_threshold`` becomes ``str(value)``;
    * a ``float`` whose magnitude is ``>= float_threshold`` or ``< 1e-6``
      (this includes ``0.0``, rendered as ``"0"``) becomes plain decimal text
      without an exponent;
    * nested dicts are processed recursively, and dicts found inside list
      values are processed as well.

    Anything else, including non-dict items of ``obj_array`` and non-dict
    items of nested lists, is returned unchanged. ``None`` yields ``[]``.
    """
    from decimal import Decimal
    import math

    if obj_array is None:
        return []

    def format_float_without_scientific(value):
        """Render a float as plain decimal text, never in scientific notation."""
        if value == 0:
            return "0"
        if not math.isfinite(value):
            return f"{value:.15f}"  # 'inf' / '-inf' / 'nan'
        # A fixed ".15f" format collapses values below 1e-15 to "0" and shows
        # binary-expansion noise on big floats; the shortest round-trip repr,
        # expanded through Decimal, keeps exactly the digits Python shows.
        formatted = format(Decimal(repr(float(value))), 'f')
        if '.' in formatted:
            formatted = formatted.rstrip('0').rstrip('.')
        return formatted

    def process_value(value):
        """Convert one dict value, recursing into nested dicts and lists."""
        if isinstance(value, int):
            return str(value) if abs(value) >= int_threshold else value
        if isinstance(value, float):
            if abs(value) >= float_threshold or abs(value) < 1e-6:
                return format_float_without_scientific(value)
            return value
        if isinstance(value, dict):
            return process_object(value)
        if isinstance(value, list):
            return [process_item(item) for item in value]
        return value

    def process_object(obj):
        """Build a converted copy of a single dict."""
        return {key: process_value(value) for key, value in obj.items()}

    def process_item(item):
        """Only dict items are converted; every other item passes through."""
        return process_object(item) if isinstance(item, dict) else item

    return [process_item(obj) for obj in obj_array]

@staticmethod
def convert_object_array_for_pandas(column_list: list, data_list: list):
    """Turn row dicts into positional rows plus a header list.

    Each entry of ``column_list`` supplies ``.name`` (the header text) and
    ``.value`` (the key looked up in every row dict). A key that is missing
    from a row yields ``None`` in that position.

    Returns ``(rows, headers)``.
    """
    headers = [column.name for column in column_list]
    rows = [
        [record.get(column.value) for column in column_list]
        for record in data_list
    ]
    return rows, headers

@staticmethod
def format_pd_data(column_list: list, data_list: list, col_formats: dict = None):
    """Build positional rows and record a display format for every column.

    Column formats are keyed by column index and take one of:
    ``'text'``, ``'number'`` or ``'default'``.

    * An ``int`` with more than 15 digits, or a ``float`` whose fixed-point
      text is longer than 15 characters, is replaced by ``str(value)`` and its
      column is marked ``'text'``.
    * Any other ``int``/``float`` marks its column ``'number'`` unless the
      column is already ``'text'``.

    ``col_formats`` may be supplied by the caller; entries already present are
    kept, missing columns start as ``'default'``, and the same dict is
    updated in place and returned.

    Returns ``(rows, headers, col_formats)``.
    """
    if col_formats is None:
        col_formats = {}

    _fields_list = []
    for field_idx, field in enumerate(column_list):
        _fields_list.append(field.name)
        # Keep formats the caller already decided on.
        col_formats.setdefault(field_idx, 'default')

    data = []
    for record in data_list or []:
        _row = []
        for field_idx, field in enumerate(column_list):
            value = record.get(field.value)
            if isinstance(value, (int, float)):
                if isinstance(value, int):
                    too_long = len(str(abs(value))) > 15
                else:
                    decimal_str = format(value, '.16f').rstrip('0').rstrip('.')
                    too_long = len(decimal_str) > 15

                if too_long:
                    value = str(value)
                    col_formats[field_idx] = 'text'
                elif col_formats[field_idx] != 'text':
                    # Short floats are tagged here as well as short ints.
                    col_formats[field_idx] = 'number'
            _row.append(value)
        data.append(_row)

    return data, _fields_list, col_formats

def run_recommend_questions_task_async(self):
self.future = executor.submit(self.run_recommend_questions_task_cache)
Expand Down
8 changes: 4 additions & 4 deletions backend/apps/data_training/api/data_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
from fastapi.responses import StreamingResponse

from apps.chat.models.chat_model import AxisObj
from apps.chat.task.llm import LLMService
from apps.data_training.curd.data_training import page_data_training, create_training, update_training, delete_training, \
enable_training, get_all_data_training
from apps.data_training.models.data_training_model import DataTrainingInfo
from common.core.deps import SessionDep, CurrentUser, Trans
from common.utils.data_format import DataFormat

router = APIRouter(tags=["DataTraining"], prefix="/system/data-training")

Expand Down Expand Up @@ -53,9 +53,9 @@ async def enable(session: SessionDep, id: int, enabled: bool, trans: Trans):

@router.get("/export")
async def export_excel(session: SessionDep, trans: Trans, current_user: CurrentUser,
word: Optional[str] = Query(None, description="搜索术语(可选)")):
question: Optional[str] = Query(None, description="搜索术语(可选)")):
def inner():
_list = get_all_data_training(session, word, oid=current_user.oid)
_list = get_all_data_training(session, question, oid=current_user.oid)

data_list = []
for obj in _list:
Expand All @@ -75,7 +75,7 @@ def inner():
fields.append(
AxisObj(name=trans('i18n_data_training.advanced_application'), value='advanced_application_name'))

md_data, _fields_list = LLMService.convert_object_array_for_pandas(fields, data_list)
md_data, _fields_list = DataFormat.convert_object_array_for_pandas(fields, data_list)

df = pd.DataFrame(md_data, columns=_fields_list)

Expand Down
8 changes: 4 additions & 4 deletions backend/apps/terminology/api/terminology.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
from fastapi.responses import StreamingResponse

from apps.chat.models.chat_model import AxisObj
from apps.chat.task.llm import LLMService
from apps.terminology.curd.terminology import page_terminology, create_terminology, update_terminology, \
delete_terminology, enable_terminology, get_all_terminology
from apps.terminology.models.terminology_model import TerminologyInfo
from common.core.deps import SessionDep, CurrentUser, Trans
from common.utils.data_format import DataFormat

router = APIRouter(tags=["Terminology"], prefix="/system/terminology")

Expand Down Expand Up @@ -62,8 +62,8 @@ def inner():
"word": obj.word,
"other_words": ', '.join(obj.other_words) if obj.other_words else '',
"description": obj.description,
"all_data_sources": 'Y' if obj.specific_ds else 'N',
"datasource": ', '.join(obj.datasource_names) if obj.datasource_names else '',
"all_data_sources": 'N' if obj.specific_ds else 'Y',
"datasource": ', '.join(obj.datasource_names) if obj.datasource_names and obj.specific_ds else '',
}
data_list.append(_data)

Expand All @@ -74,7 +74,7 @@ def inner():
fields.append(AxisObj(name=trans('i18n_terminology.effective_data_sources'), value='datasource'))
fields.append(AxisObj(name=trans('i18n_terminology.all_data_sources'), value='all_data_sources'))

md_data, _fields_list = LLMService.convert_object_array_for_pandas(fields, data_list)
md_data, _fields_list = DataFormat.convert_object_array_for_pandas(fields, data_list)

df = pd.DataFrame(md_data, columns=_fields_list)

Expand Down
117 changes: 117 additions & 0 deletions backend/common/utils/data_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import pandas as pd

class DataFormat:
    """Stateless helpers that prepare row dictionaries and DataFrames for display or export."""

    @staticmethod
    def safe_convert_to_string(df):
        """Return a copy of ``df`` in which every cell is rendered as text.

        Missing scalars (``None``, ``NaN``, ``NaT`` ...) become ``""``. Every
        other cell becomes a zero-width space (U+200B) followed by
        ``str(cell)``.
        NOTE(review): the zero-width prefix presumably keeps renderers from
        re-interpreting the text as a number -- confirm with the consumers.

        The input frame is not modified.
        """

        def format_value(x):
            # pd.isna() is element-wise on list-likes (e.g. array/JSON cells),
            # and the resulting array has no single truth value, so only
            # scalars are tested for missingness.
            if pd.api.types.is_scalar(x) and pd.isna(x):
                return ""
            return "\u200b" + str(x)

        df_copy = df.copy()
        for col in df_copy.columns:
            df_copy[col] = df_copy[col].apply(format_value)
        return df_copy

    @staticmethod
    def convert_large_numbers_in_object_array(obj_array, int_threshold=1e15, float_threshold=1e10):
        """Return a new list with very large / very small numbers turned into strings.

        For every dict in ``obj_array``:

        * an ``int`` whose magnitude is ``>= int_threshold`` becomes
          ``str(value)``;
        * a ``float`` whose magnitude is ``>= float_threshold`` or ``< 1e-6``
          (this includes ``0.0``, rendered as ``"0"``) becomes plain decimal
          text without an exponent;
        * nested dicts are processed recursively, and dicts found inside list
          values are processed as well.

        Anything else, including non-dict items of ``obj_array`` and non-dict
        items of nested lists, is returned unchanged. ``None`` yields ``[]``.
        """
        from decimal import Decimal
        import math

        if obj_array is None:
            return []

        def format_float_without_scientific(value):
            """Render a float as plain decimal text, never in scientific notation."""
            if value == 0:
                return "0"
            if not math.isfinite(value):
                return f"{value:.15f}"  # 'inf' / '-inf' / 'nan'
            # A fixed ".15f" format collapses values below 1e-15 to "0" and
            # shows binary-expansion noise on big floats; the shortest
            # round-trip repr, expanded through Decimal, keeps exactly the
            # digits Python shows.
            formatted = format(Decimal(repr(float(value))), 'f')
            if '.' in formatted:
                formatted = formatted.rstrip('0').rstrip('.')
            return formatted

        def process_value(value):
            """Convert one dict value, recursing into nested dicts and lists."""
            if isinstance(value, int):
                return str(value) if abs(value) >= int_threshold else value
            if isinstance(value, float):
                if abs(value) >= float_threshold or abs(value) < 1e-6:
                    return format_float_without_scientific(value)
                return value
            if isinstance(value, dict):
                return process_object(value)
            if isinstance(value, list):
                return [process_item(item) for item in value]
            return value

        def process_object(obj):
            """Build a converted copy of a single dict."""
            return {key: process_value(value) for key, value in obj.items()}

        def process_item(item):
            """Only dict items are converted; every other item passes through."""
            return process_object(item) if isinstance(item, dict) else item

        return [process_item(obj) for obj in obj_array]

    @staticmethod
    def convert_object_array_for_pandas(column_list: list, data_list: list):
        """Turn row dicts into positional rows plus a header list.

        Each entry of ``column_list`` supplies ``.name`` (the header text) and
        ``.value`` (the key looked up in every row dict). A key that is
        missing from a row yields ``None`` in that position. A ``None``
        ``data_list`` is treated as empty.

        Returns ``(rows, headers)``.
        """
        _fields_list = [field.name for field in column_list]
        md_data = [
            [inner_data.get(field.value) for field in column_list]
            for inner_data in (data_list or [])
        ]
        return md_data, _fields_list

    @staticmethod
    def format_pd_data(column_list: list, data_list: list, col_formats: dict = None):
        """Build positional rows and record a display format for every column.

        Column formats are keyed by column index and take one of:
        ``'text'``, ``'number'`` or ``'default'``.

        * An ``int`` with more than 15 digits, or a ``float`` whose
          fixed-point text is longer than 15 characters, is replaced by
          ``str(value)`` and its column is marked ``'text'``.
        * Any other ``int``/``float`` marks its column ``'number'`` unless the
          column is already ``'text'``.

        ``col_formats`` may be supplied by the caller; entries already present
        are kept, missing columns start as ``'default'``, and the same dict is
        updated in place and returned.

        Returns ``(rows, headers, col_formats)``.
        """
        if col_formats is None:
            col_formats = {}

        _fields_list = []
        for field_idx, field in enumerate(column_list):
            _fields_list.append(field.name)
            # Keep formats the caller already decided on.
            col_formats.setdefault(field_idx, 'default')

        data = []
        for record in data_list or []:
            _row = []
            for field_idx, field in enumerate(column_list):
                value = record.get(field.value)
                if isinstance(value, (int, float)):
                    if isinstance(value, int):
                        too_long = len(str(abs(value))) > 15
                    else:
                        decimal_str = format(value, '.16f').rstrip('0').rstrip('.')
                        too_long = len(decimal_str) > 15

                    if too_long:
                        value = str(value)
                        col_formats[field_idx] = 'text'
                    elif col_formats[field_idx] != 'text':
                        # Short floats are tagged here as well as short ints.
                        col_formats[field_idx] = 'number'
                _row.append(value)
            data.append(_row)

        return data, _fields_list, col_formats
6 changes: 5 additions & 1 deletion backend/locales/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@
},
"i18n_custom_prompt": {
"exists_in_db": "Template name already exists",
"not_exists": "This template does not exist"
"not_exists": "This template does not exist",
"prompt_word_name": "Prompt word name",
"prompt_word_content": "Prompt word content",
"effective_data_sources": "Effective Data Sources",
"all_data_sources": "All Data Sources"
},
"i18n_excel_export": {
"data_is_empty": "Form data is empty, unable to export data"
Expand Down
Loading