fix(Hive): normalize identifier quoting and stabilize metadata/query paths

yym · ulleo · commit eb587fce7bec · 2026-05-08T11:53:08.000+08:00
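Switch Hive identifier quoting from double quotes to backticks (the DB enum entry, a fallback rewrite of double-quoted identifiers to backticks in exec_sql, and a new Hive.yaml SQL-example template) so generated SQL returns real column values. Also make TableSchema's comment argument optional so rows that carry only a table name can construct it; add SHOW DATABASES branches to get_schema for hive, doris/starrocks and ck; import pyhive unconditionally and declare pyhive[hive] and thrift-sasl as dependencies; and have check_sql_read screen the statement's first keyword against allow/deny lists, parse Hive SQL with sqlglot's hive dialect, and stop classifying exp.Command as a write.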

Made-with: Cursor
diff --git a/backend/apps/datasource/models/datasource.py b/backend/apps/datasource/models/datasource.py
@@ -143,7 +143,7 @@ def to_dict(self):
 
 
 class TableSchema:
-    def __init__(self, attr1, attr2):
+    def __init__(self, attr1, attr2=None):
         self.tableName = attr1
         self.tableComment = attr2 if attr2 is None or isinstance(attr2, str) else attr2.decode("utf-8")
 
diff --git a/backend/apps/db/constant.py b/backend/apps/db/constant.py
@@ -29,7 +29,7 @@ class DB(Enum):
     pg = ('pg', 'PostgreSQL', '"', '"', ConnectType.sqlalchemy, 'PostgreSQL', [])
     starrocks = ('starrocks', 'StarRocks', '`', '`', ConnectType.py_driver, 'StarRocks', [])
     sqlite = ('sqlite', 'SQLite', '"', '"', ConnectType.sqlalchemy, 'SQLite', [])
-    hive = ('hive', 'Apache Hive', '"', '"', ConnectType.py_driver, 'Hive', [])
+    hive = ('hive', 'Apache Hive', '`', '`', ConnectType.py_driver, 'Hive', [])
 
     def __init__(self, type, db_name, prefix, suffix, connect_type: ConnectType, template_name: str,
                  illegalParams: List[str]):
diff --git a/backend/apps/db/db.py b/backend/apps/db/db.py
@@ -2,6 +2,7 @@
 import json
 import os
 import platform
+import re
 import urllib.parse
 from datetime import datetime, date, time, timedelta
 from decimal import Decimal
@@ -35,12 +36,8 @@
 import sqlglot
 from sqlglot import expressions as exp
 from sqlalchemy.pool import NullPool
+from pyhive import hive
 
-try:
-    from pyhive import hive
-    PYHIVE_AVAILABLE = True
-except ImportError:
-    PYHIVE_AVAILABLE = False
 
 try:
     if os.path.exists(settings.ORACLE_CLIENT_PATH):
@@ -259,25 +256,22 @@ def check_connection(trans: Optional[Trans], ds: CoreDatasource | AssistantOutDs
                         raise HTTPException(status_code=500, detail=trans('i18n_ds_invalid') + f': {e.args}')
                     return False
         elif equals_ignore_case(ds.type, 'hive'):
-            if PYHIVE_AVAILABLE:
-                try:
-                    conn = hive.connect(host=conf.host, port=conf.port, username=conf.username,
-                                       database=conf.database, **extra_config_dict)
-                    cursor = conn.cursor()
-                    cursor.execute('select 1')
-                    cursor.fetchall()
-                    cursor.close()
-                    conn.close()
-                    SQLBotLogUtil.info("success")
-                    return True
-                except Exception as e:
-                    SQLBotLogUtil.error(f"Datasource {ds.id} connection failed: {e}")
-                    if is_raise:
-                        raise HTTPException(status_code=500, detail=trans('i18n_ds_invalid') + f': {e.args}')
-                    return False
-            else:
-                SQLBotLogUtil.error("pyhive not installed")
+            try:
+                conn = hive.connect(host=conf.host, port=conf.port, username=conf.username,
+                                    database=conf.database, **extra_config_dict)
+                cursor = conn.cursor()
+                cursor.execute('select 1')
+                cursor.fetchall()
+                cursor.close()
+                conn.close()
+                SQLBotLogUtil.info("success")
+                return True
+            except Exception as e:
+                SQLBotLogUtil.error(f"Datasource {ds.id} connection failed: {e}")
+                if is_raise:
+                    raise HTTPException(status_code=500, detail=trans('i18n_ds_invalid') + f': {e.args}')
                 return False
+        
         elif equals_ignore_case(ds.type, 'es'):
             es_conn = get_es_connect(conf)
             if es_conn.ping():
@@ -403,6 +397,30 @@ def get_schema(ds: CoreDatasource):
                 res = cursor.fetchall()
                 res_list = [item[0] for item in res]
                 return res_list
+        elif equals_ignore_case(ds.type, 'hive'):
+            conn = hive.connect(host=conf.host, port=conf.port, username=conf.username,
+                                database=conf.database, **extra_config_dict)
+            cursor = conn.cursor()
+            cursor.execute('SHOW DATABASES')
+            res = cursor.fetchall()
+            res_list = [item[0] for item in res]
+            cursor.close()
+            conn.close()
+            return res_list
+        elif equals_ignore_case(ds.type, 'doris', 'starrocks'):
+            with pymysql.connect(user=conf.username, passwd=conf.password, host=conf.host,
+                                 port=conf.port, db=conf.database, connect_timeout=10,
+                                 read_timeout=10, **extra_config_dict) as conn, conn.cursor() as cursor:
+                cursor.execute('SHOW DATABASES')
+                res = cursor.fetchall()
+                res_list = [item[0] for item in res]
+                return res_list
+        elif equals_ignore_case(ds.type, 'ck'):
+            with get_session(ds) as session:
+                with session.execute(text('SHOW DATABASES')) as result:
+                    res = result.fetchall()
+                    res_list = [item[0] for item in res]
+                    return res_list
 
 
 def get_tables(ds: CoreDatasource):
@@ -465,17 +483,15 @@ def get_tables(ds: CoreDatasource):
             res_list = [TableSchema(*item) for item in res]
             return res_list
         elif equals_ignore_case(ds.type, 'hive'):
-            if PYHIVE_AVAILABLE:
-                conn = hive.connect(host=conf.host, port=conf.port, username=conf.username,
-                                   database=conf.database, **extra_config_dict)
-                cursor = conn.cursor()
-                cursor.execute(sql)
-                res = cursor.fetchall()
-                res_list = [TableSchema(*item) for item in res]
-                cursor.close()
-                conn.close()
-                return res_list
-            return []
+            conn = hive.connect(host=conf.host, port=conf.port, username=conf.username,
+                                database=conf.database, **extra_config_dict)
+            cursor = conn.cursor()
+            cursor.execute(sql)
+            res = cursor.fetchall()
+            res_list = [TableSchema(*item) for item in res]
+            cursor.close()
+            conn.close()
+            return res_list
 
 
 def get_fields(ds: CoreDatasource, table_name: str = None):
@@ -538,17 +554,15 @@ def get_fields(ds: CoreDatasource, table_name: str = None):
             res_list = [ColumnSchema(*item) for item in res]
             return res_list
         elif equals_ignore_case(ds.type, 'hive'):
-            if PYHIVE_AVAILABLE:
-                conn = hive.connect(host=conf.host, port=conf.port, username=conf.username,
-                                   database=conf.database, **extra_config_dict)
-                cursor = conn.cursor()
-                cursor.execute(sql)
-                res = cursor.fetchall()
-                res_list = [ColumnSchema(*item) for item in res]
-                cursor.close()
-                conn.close()
-                return res_list
-            return []
+            conn = hive.connect(host=conf.host, port=conf.port, username=conf.username,
+                                database=conf.database, **extra_config_dict)
+            cursor = conn.cursor()
+            cursor.execute(sql)
+            res = cursor.fetchall()
+            res_list = [ColumnSchema(*item) for item in res]
+            cursor.close()
+            conn.close()
+            return res_list
 
 
 def convert_value(value, datetime_format='space'):
@@ -737,37 +751,53 @@ def exec_sql(ds: CoreDatasource | AssistantOutDsSchema, sql: str, origin_column=
             except Exception as ex:
                 raise Exception(str(ex))
         elif equals_ignore_case(ds.type, 'hive'):
-            if PYHIVE_AVAILABLE:
-                conn = hive.connect(host=conf.host, port=conf.port, username=conf.username,
-                                   database=conf.database, **extra_config_dict)
-                cursor = conn.cursor()
-                try:
-                    cursor.execute(sql)
-                    res = cursor.fetchall()
-                    columns = [field[0] for field in cursor.description] if origin_column else [field[0].lower() for
-                                                                                                field in
-                                                                                                cursor.description]
-                    result_list = [
-                        {str(columns[i]): convert_value(value) for i, value in enumerate(tuple_item)} for tuple_item in
-                        res
-                    ]
-                    return {"fields": columns, "data": result_list,
-                            "sql": bytes.decode(base64.b64encode(bytes(sql, 'utf-8')))}
-                except Exception as ex:
-                    raise ParseSQLResultError(str(ex))
-                finally:
-                    cursor.close()
-                    conn.close()
-            raise Exception("pyhive not installed")
+            conn = hive.connect(host=conf.host, port=conf.port, username=conf.username,
+                                database=conf.database, **extra_config_dict)
+            cursor = conn.cursor()
+            try:
+                # Hive uses backticks for identifiers; normalize quoted identifiers as a compatibility fallback.
+                hive_sql = re.sub(r'"([A-Za-z_][A-Za-z0-9_]*)"', r'`\1`', sql)
+                cursor.execute(hive_sql)
+                res = cursor.fetchall()
+                columns = [field[0] for field in cursor.description] if origin_column else [field[0].lower() for
+                                                                                            field in
+                                                                                            cursor.description]
+                result_list = [
+                    {str(columns[i]): convert_value(value) for i, value in enumerate(tuple_item)} for tuple_item in
+                    res
+                ]
+                return {"fields": columns, "data": result_list,
+                        "sql": bytes.decode(base64.b64encode(bytes(hive_sql, 'utf-8')))}
+            except Exception as ex:
+                raise ParseSQLResultError(str(ex))
+            finally:
+                cursor.close()
+                conn.close()
 
 
 def check_sql_read(sql: str, ds: CoreDatasource | AssistantOutDsSchema):
     try:
+        normalized_sql = sql.strip().lstrip("(").strip()
+        first_keyword = normalized_sql.split(None, 1)[0].upper() if normalized_sql else ""
+        allowed_read_commands = {"SELECT", "WITH", "SHOW", "DESCRIBE", "DESC", "EXPLAIN"}
+        denied_write_commands = {
+            "INSERT", "UPDATE", "DELETE", "CREATE", "DROP", "ALTER",
+            "TRUNCATE", "MERGE", "COPY", "REPLACE", "GRANT", "REVOKE",
+            "USE", "SET", "CALL"
+        }
+
+        if not first_keyword:
+            raise ValueError("Parse SQL Error")
+        if first_keyword in denied_write_commands:
+            return False
+
         dialect = None
         if equals_ignore_case(ds.type, 'mysql', 'doris', 'starrocks'):
             dialect = 'mysql'
         elif equals_ignore_case(ds.type, 'sqlServer'):
             dialect = 'tsql'
+        elif equals_ignore_case(ds.type, 'hive'):
+            dialect = 'hive'
 
         statements = sqlglot.parse(sql, dialect=dialect)
 
@@ -777,7 +807,7 @@ def check_sql_read(sql: str, ds: CoreDatasource | AssistantOutDsSchema):
         write_types = (
             exp.Insert, exp.Update, exp.Delete,
             exp.Create, exp.Drop, exp.Alter,
-            exp.Merge, exp.Command, exp.Copy
+            exp.Merge, exp.Copy
         )
 
         for stmt in statements:
@@ -786,7 +816,7 @@ def check_sql_read(sql: str, ds: CoreDatasource | AssistantOutDsSchema):
             if isinstance(stmt, write_types):
                 return False
 
-        return True
+        return first_keyword in allowed_read_commands
 
     except Exception as e:
         raise ValueError(f"Parse SQL Error: {e}")
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
@@ -53,7 +53,9 @@ dependencies = [
     "elasticsearch[requests] (>=7.10,<8.0)",
     "ldap3>=2.9.1",
     "sqlglot>=28.6.0",
-    "numpy==2.3.5"
+    "numpy==2.3.5",
+    "pyhive[hive]>=0.7.0",
+    "thrift-sasl"
 ]
 
 [project.optional-dependencies]
diff --git a/backend/templates/sql_examples/Hive.yaml b/backend/templates/sql_examples/Hive.yaml
@@ -0,0 +1,86 @@
+template:
+  quot_rule: |
+    <rule>
+      必须对数据库名、表名、字段名、别名外层加反引号（`）。
+      <note>
+        1. 点号（.）不能包含在引号内，必须写成 `database`.`table`
+        2. 即使标识符不含特殊字符或非关键字，也需强制加反引号
+      </note>
+    </rule>
+
+  limit_rule: |
+    <rule>
+      当需要限制行数时，必须使用标准的LIMIT语法
+    </rule>
+
+  other_rule: |
+    <rule>必须为每个表生成别名（不加AS）</rule>
+    {multi_table_condition}
+    <rule>禁止使用星号(*)，必须明确字段名</rule>
+    <rule>中文/特殊字符字段需保留原名并添加英文别名</rule>
+    <rule>不能用 + 拼接字符串，字符串必须使用单引号</rule>
+    <rule>分组非常严格：SELECT 里的字段必须出现在 GROUP BY 里，或者是聚合函数</rule>
+    <rule>函数字段必须加别名</rule>
+    <rule>百分比字段保留两位小数并以%结尾</rule>
+    <rule>WHERE 条件中不能使用 >、<、>=、<= 等比较运算符，必须使用 =</rule>
+    <rule>HIVE 中没有 NOT IN 操作符，必须使用 LEFT JOIN 或 EXISTS 替代</rule>
+    <rule>判空使用 NVL()函数</rule>
+    <rule>避免与数据库关键字冲突</rule>
+
+  basic_example: |
+    <basic-examples>
+      <intro>
+        📌 以下示例严格遵循<Rules>中的 Hive 规范，展示符合要求的 SQL 写法与典型错误案例。
+        ⚠️ 注意：示例中的表名、字段名均为演示虚构，实际使用时需替换为用户提供的真实标识符。
+        🔍 重点观察：
+          1. 反引号包裹所有数据库对象的规范用法
+          2. 中英别名/百分比/函数等特殊字段的处理
+          3. 关键字冲突的规避方式
+      </intro>
+      <example>
+        <input>查询 ods.orders 表的前100条订单（含中文字段和百分比）</input>
+        <output-bad>
+          SELECT * FROM ods.orders LIMIT 100  -- 错误：未加引号、使用星号
+          SELECT `订单ID`, `金额` FROM `ods`.`orders` `t1` LIMIT 100  -- 错误：缺少英文别名
+          SELECT COUNT(`订单ID`) FROM `ods`.`orders` `t1`  -- 错误：函数未加别名
+        </output-bad>
+        <output-good>
+          SELECT
+            `t1`.`订单ID` AS `order_id`,
+            `t1`.`金额` AS `amount`,
+            COUNT(`t1`.`订单ID`) AS `total_orders`,
+            CONCAT(CAST(ROUND(`t1`.`折扣率` * 100, 2) AS STRING), '%') AS `discount_percent`
+          FROM `ods`.`orders` `t1`
+          LIMIT 100
+        </output-good>
+      </example>
+
+      <example>
+        <input>统计 dim.users（含关键字字段user）的活跃占比</input>
+        <output-bad>
+          SELECT user, status FROM dim.users  -- 错误：未处理关键字和引号
+          SELECT `user`, ROUND(active_ratio) FROM `dim`.`users`  -- 错误：百分比格式错误
+        </output-bad>
+        <output-good>
+          SELECT
+            `u`.`user` AS `username`,
+            CONCAT(CAST(ROUND(`u`.`active_ratio` * 100, 2) AS STRING), '%') AS `active_percent`
+          FROM `dim`.`users` `u`
+          WHERE `u`.`status` = 1
+        </output-good>
+      </example>
+    </basic-examples>
+
+  example_engine: Apache Hive 2.X
+  example_answer_1: |
+    {"success":true,"sql":"SELECT `country` AS `country_name`, `continent` AS `continent_name`, `year` AS `year`, `gdp` AS `gdp` FROM `Sample_Database`.`sample_country_gdp` ORDER BY `country`, `year`","tables":["sample_country_gdp"],"chart-type":"line"}
+  example_answer_1_with_limit: |
+    {"success":true,"sql":"SELECT `country` AS `country_name`, `continent` AS `continent_name`, `year` AS `year`, `gdp` AS `gdp` FROM `Sample_Database`.`sample_country_gdp` ORDER BY `country`, `year` LIMIT 1000","tables":["sample_country_gdp"],"chart-type":"line"}
+  example_answer_2: |
+    {"success":true,"sql":"SELECT `country` AS `country_name`, `gdp` AS `gdp` FROM `Sample_Database`.`sample_country_gdp` WHERE `year` = '2024' ORDER BY `gdp` DESC","tables":["sample_country_gdp"],"chart-type":"pie"}
+  example_answer_2_with_limit: |
+    {"success":true,"sql":"SELECT `country` AS `country_name`, `gdp` AS `gdp` FROM `Sample_Database`.`sample_country_gdp` WHERE `year` = '2024' ORDER BY `gdp` DESC LIMIT 1000","tables":["sample_country_gdp"],"chart-type":"pie"}
+  example_answer_3: |
+    {"success":true,"sql":"SELECT `country` AS `country_name`, `gdp` AS `gdp` FROM `Sample_Database`.`sample_country_gdp` WHERE `year` = '2025' AND `country` = '中国'","tables":["sample_country_gdp"],"chart-type":"table"}
+  example_answer_3_with_limit: |
+    {"success":true,"sql":"SELECT `country` AS `country_name`, `gdp` AS `gdp` FROM `Sample_Database`.`sample_country_gdp` WHERE `year` = '2025' AND `country` = '中国' LIMIT 1000","tables":["sample_country_gdp"],"chart-type":"table"}