Skip to content

Commit

Permalink
[improvement](statistics) Reduce partition column sample BE memory consumption. (#41203)
Browse files Browse the repository at this point in the history

For string type columns, use xxhash_64 to transfer column value to an
integer, and then calculate the NDV based on the integer hash value. In
this case, we can reduce the memory cost of sample analyze and improve
the performance.
For example, for the l_comment column of the TPCH 100G lineitem table, the
memory cost to calculate its NDV is reduced from 22GB to 8GB.
  • Loading branch information
Jibing-Li authored Sep 26, 2024
1 parent 370a727 commit 8e33cda
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -98,30 +98,6 @@ public abstract class BaseAnalysisTask {
+ "NOW() "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}";

protected static final String DUJ1_ANALYZE_STRING_TEMPLATE = "SELECT "
+ "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, "
+ "${catalogId} AS `catalog_id`, "
+ "${dbId} AS `db_id`, "
+ "${tblId} AS `tbl_id`, "
+ "${idxId} AS `idx_id`, "
+ "'${colId}' AS `col_id`, "
+ "NULL AS `part_id`, "
+ "${rowCount} AS `row_count`, "
+ "${ndvFunction} as `ndv`, "
+ "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, "
+ "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, "
+ "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, "
+ "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
+ "NOW() "
+ "FROM ( "
+ " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` "
+ " FROM "
+ " (SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` "
+ " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} "
+ " ${sampleHints} ${limit}) as `t0` "
+ " GROUP BY `t0`.`colValue` "
+ ") as `t1` ";

protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT "
+ "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, "
+ "${catalogId} AS `catalog_id`, "
Expand All @@ -138,11 +114,11 @@ public abstract class BaseAnalysisTask {
+ "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
+ "NOW() "
+ "FROM ( "
+ " SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` "
+ " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as `column_length` "
+ " FROM "
+ " (SELECT `${colName}` FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} "
+ " ${sampleHints} ${limit}) as `t0` "
+ " GROUP BY `t0`.`${colName}` "
+ " (SELECT ${subStringColName} AS `colValue`, LENGTH(`${colName}`) as `len` "
+ " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}) as `t0` "
+ " GROUP BY `t0`.`colValue` "
+ ") as `t1` ";

protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = " SELECT "
Expand Down Expand Up @@ -283,7 +259,7 @@ public long getJobId() {
protected String getDataSizeFunction(Column column, boolean useDuj1) {
if (useDuj1) {
if (column.getType().isStringType()) {
return "SUM(LENGTH(`column_key`) * count)";
return "SUM(`column_length`)";
} else {
return "SUM(t1.count) * " + column.getType().getSlotSize();
}
Expand All @@ -296,6 +272,14 @@ protected String getDataSizeFunction(Column column, boolean useDuj1) {
}
}

protected String getStringTypeColName(Column column) {
if (column.getType().isStringType()) {
return "xxhash_64(SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024))";
} else {
return "`${colName}`";
}
}

protected String getMinFunction() {
if (tableSample == null) {
return "CAST(MIN(`${colName}`) as ${type}) ";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,8 @@ protected void doSample() {
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
} else {
if (col.getType().isStringType()) {
sb.append(DUJ1_ANALYZE_STRING_TEMPLATE);
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);
}
sb.append(DUJ1_ANALYZE_TEMPLATE);
params.put("subStringColName", getStringTypeColName(col));
params.put("dataSizeFunction", getDataSizeFunction(col, true));
params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})"));
params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,8 @@ protected void doSample() {
sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
} else {
params.put("dataSizeFunction", getDataSizeFunction(col, true));
if (col.getType().isStringType()) {
sql = stringSubstitutor.replace(DUJ1_ANALYZE_STRING_TEMPLATE);
} else {
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
}
params.put("subStringColName", getStringTypeColName(col));
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
}
LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], "
+ "limited [{}], distribute column [{}], partition column [{}], key column [{}], "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public void testGetFunctions() {
OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
Column column = new Column("string_column", PrimitiveType.STRING);
String dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, true);
Assertions.assertEquals("SUM(LENGTH(`column_key`) * count)", dataSizeFunction);
Assertions.assertEquals("SUM(`column_length`)", dataSizeFunction);
dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, false);
Assertions.assertEquals("SUM(LENGTH(`${colName}`))", dataSizeFunction);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,11 @@ public void runQuery(String sql) {
+ "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`,"
+ " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+ "SUM(t1.count) * 4 * 5.0 AS `data_size`, NOW() "
+ "FROM ( SELECT t0.`null` as `column_key`, COUNT(1) "
+ "as `count` FROM (SELECT `null` FROM `catalogName`.`${dbName}`.`null`"
+ " limit 100) as `t0` GROUP BY `t0`.`null` ) as `t1` ", sql);
+ "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) "
+ "as `count`, SUM(`len`) as `column_length` FROM "
+ "(SELECT `null` AS `colValue`, LENGTH(`null`) as `len` "
+ "FROM `catalogName`.`${dbName}`.`null`"
+ " limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql);
return;
}
};
Expand Down Expand Up @@ -296,10 +298,11 @@ public void runQuery(String sql) {
+ "IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, "
+ "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, "
+ "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+ "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() "
+ "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` FROM "
+ "(SELECT SUBSTRING(CAST(`null` AS STRING), 1, 1024) AS `colValue` "
+ "FROM `catalogName`.`${dbName}`.`null` limit 100) as `t0` "
+ "SUM(`column_length`) * 5.0 AS `data_size`, NOW() "
+ "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as "
+ "`column_length` FROM (SELECT xxhash_64(SUBSTRING(CAST(`null` AS STRING), 1, 1024)) "
+ "AS `colValue`, LENGTH(`null`) as `len`"
+ " FROM `catalogName`.`${dbName}`.`null` limit 100) as `t0` "
+ "GROUP BY `t0`.`colValue` ) as `t1` ", sql);
return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_docker,external_docker_hive") {
String enabled = context.config.otherConfigs.get("enableHiveTest")
if (enabled == null || !enabled.equalsIgnoreCase("true")) {
logger.info("diable Hive test.")
logger.info("disable Hive test.")
return;
}

Expand All @@ -34,10 +34,35 @@ suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_d
'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
);"""
sql """use `${catalog_name}`.`default`"""
sql """analyze table orc_all_types with sync"""
sql """analyze table orc_all_types with sync with sample rows 4000000"""
def result = sql """show column stats orc_all_types;"""
assertEquals(16, result.size())

result = sql """show column stats orc_all_types (int_col);"""
assertEquals("int_col", result[0][0])
assertEquals("3600.0", result[0][2])
assertEquals("3240.0", result[0][3])
assertEquals("361.0", result[0][4])
assertEquals("14400.0", result[0][5])

result = sql """show column stats orc_all_types (string_col);"""
assertEquals("string_col", result[0][0])
assertEquals("3600.0", result[0][2])
assertEquals("3254.0", result[0][3])
assertEquals("347.0", result[0][4])
assertEquals("453634.0", result[0][5])

result = sql """show column stats orc_all_types (varchar_col);"""
assertEquals("varchar_col", result[0][0])
assertEquals("3600.0", result[0][2])
assertEquals("6.0", result[0][3])
assertEquals("0.0", result[0][4])
assertEquals("35950.0", result[0][5])

sql """drop stats orc_all_types"""
sql """analyze table orc_all_types with sync"""
result = sql """show column stats orc_all_types;"""
assertEquals(16, result.size())
result = sql """show column stats orc_all_types (int_col);"""
assertEquals("int_col", result[0][0])
assertEquals("3600.0", result[0][2])
Expand Down

0 comments on commit 8e33cda

Please sign in to comment.