Skip to content

Commit

Permalink
[improvement](statistics) Reduce partition column sample BE memory consumption. (#41203)
Browse files Browse the repository at this point in the history

For string type columns, use xxhash_64 to transfer column value to an
integer, and then calculate the NDV based on the integer hash value. In
this case, we can reduce the memory cost of sample analyze and improve
the performance.
For example, for the l_comment column of the TPCH 100G lineitem table, the
memory cost to calculate its NDV is reduced from 22GB to 8GB.
  • Loading branch information
Jibing-Li authored Sep 26, 2024
1 parent 370a727 commit 8e33cda
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -98,30 +98,6 @@ public abstract class BaseAnalysisTask {
+ "NOW() "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}";

protected static final String DUJ1_ANALYZE_STRING_TEMPLATE = "SELECT "
+ "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, "
+ "${catalogId} AS `catalog_id`, "
+ "${dbId} AS `db_id`, "
+ "${tblId} AS `tbl_id`, "
+ "${idxId} AS `idx_id`, "
+ "'${colId}' AS `col_id`, "
+ "NULL AS `part_id`, "
+ "${rowCount} AS `row_count`, "
+ "${ndvFunction} as `ndv`, "
+ "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, "
+ "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, "
+ "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, "
+ "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
+ "NOW() "
+ "FROM ( "
+ " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` "
+ " FROM "
+ " (SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` "
+ " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} "
+ " ${sampleHints} ${limit}) as `t0` "
+ " GROUP BY `t0`.`colValue` "
+ ") as `t1` ";

protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT "
+ "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, "
+ "${catalogId} AS `catalog_id`, "
Expand All @@ -138,11 +114,11 @@ public abstract class BaseAnalysisTask {
+ "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
+ "NOW() "
+ "FROM ( "
+ " SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` "
+ " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as `column_length` "
+ " FROM "
+ " (SELECT `${colName}` FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} "
+ " ${sampleHints} ${limit}) as `t0` "
+ " GROUP BY `t0`.`${colName}` "
+ " (SELECT ${subStringColName} AS `colValue`, LENGTH(`${colName}`) as `len` "
+ " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}) as `t0` "
+ " GROUP BY `t0`.`colValue` "
+ ") as `t1` ";

protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = " SELECT "
Expand Down Expand Up @@ -283,7 +259,7 @@ public long getJobId() {
protected String getDataSizeFunction(Column column, boolean useDuj1) {
if (useDuj1) {
if (column.getType().isStringType()) {
return "SUM(LENGTH(`column_key`) * count)";
return "SUM(`column_length`)";
} else {
return "SUM(t1.count) * " + column.getType().getSlotSize();
}
Expand All @@ -296,6 +272,14 @@ protected String getDataSizeFunction(Column column, boolean useDuj1) {
}
}

protected String getStringTypeColName(Column column) {
if (column.getType().isStringType()) {
return "xxhash_64(SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024))";
} else {
return "`${colName}`";
}
}

protected String getMinFunction() {
if (tableSample == null) {
return "CAST(MIN(`${colName}`) as ${type}) ";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,8 @@ protected void doSample() {
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
} else {
if (col.getType().isStringType()) {
sb.append(DUJ1_ANALYZE_STRING_TEMPLATE);
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);
}
sb.append(DUJ1_ANALYZE_TEMPLATE);
params.put("subStringColName", getStringTypeColName(col));
params.put("dataSizeFunction", getDataSizeFunction(col, true));
params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})"));
params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,8 @@ protected void doSample() {
sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
} else {
params.put("dataSizeFunction", getDataSizeFunction(col, true));
if (col.getType().isStringType()) {
sql = stringSubstitutor.replace(DUJ1_ANALYZE_STRING_TEMPLATE);
} else {
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
}
params.put("subStringColName", getStringTypeColName(col));
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
}
LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], "
+ "limited [{}], distribute column [{}], partition column [{}], key column [{}], "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public void testGetFunctions() {
OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
Column column = new Column("string_column", PrimitiveType.STRING);
String dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, true);
Assertions.assertEquals("SUM(LENGTH(`column_key`) * count)", dataSizeFunction);
Assertions.assertEquals("SUM(`column_length`)", dataSizeFunction);
dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, false);
Assertions.assertEquals("SUM(LENGTH(`${colName}`))", dataSizeFunction);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,11 @@ public void runQuery(String sql) {
+ "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`,"
+ " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+ "SUM(t1.count) * 4 * 5.0 AS `data_size`, NOW() "
+ "FROM ( SELECT t0.`null` as `column_key`, COUNT(1) "
+ "as `count` FROM (SELECT `null` FROM `catalogName`.`${dbName}`.`null`"
+ " limit 100) as `t0` GROUP BY `t0`.`null` ) as `t1` ", sql);
+ "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) "
+ "as `count`, SUM(`len`) as `column_length` FROM "
+ "(SELECT `null` AS `colValue`, LENGTH(`null`) as `len` "
+ "FROM `catalogName`.`${dbName}`.`null`"
+ " limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql);
return;
}
};
Expand Down Expand Up @@ -296,10 +298,11 @@ public void runQuery(String sql) {
+ "IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, "
+ "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, "
+ "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+ "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() "
+ "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` FROM "
+ "(SELECT SUBSTRING(CAST(`null` AS STRING), 1, 1024) AS `colValue` "
+ "FROM `catalogName`.`${dbName}`.`null` limit 100) as `t0` "
+ "SUM(`column_length`) * 5.0 AS `data_size`, NOW() "
+ "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as "
+ "`column_length` FROM (SELECT xxhash_64(SUBSTRING(CAST(`null` AS STRING), 1, 1024)) "
+ "AS `colValue`, LENGTH(`null`) as `len`"
+ " FROM `catalogName`.`${dbName}`.`null` limit 100) as `t0` "
+ "GROUP BY `t0`.`colValue` ) as `t1` ", sql);
return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_docker,external_docker_hive") {
String enabled = context.config.otherConfigs.get("enableHiveTest")
if (enabled == null || !enabled.equalsIgnoreCase("true")) {
logger.info("diable Hive test.")
logger.info("disable Hive test.")
return;
}

Expand All @@ -34,10 +34,35 @@ suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_d
'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
);"""
sql """use `${catalog_name}`.`default`"""
sql """analyze table orc_all_types with sync"""
sql """analyze table orc_all_types with sync with sample rows 4000000"""
def result = sql """show column stats orc_all_types;"""
assertEquals(16, result.size())

result = sql """show column stats orc_all_types (int_col);"""
assertEquals("int_col", result[0][0])
assertEquals("3600.0", result[0][2])
assertEquals("3240.0", result[0][3])
assertEquals("361.0", result[0][4])
assertEquals("14400.0", result[0][5])

result = sql """show column stats orc_all_types (string_col);"""
assertEquals("string_col", result[0][0])
assertEquals("3600.0", result[0][2])
assertEquals("3254.0", result[0][3])
assertEquals("347.0", result[0][4])
assertEquals("453634.0", result[0][5])

result = sql """show column stats orc_all_types (varchar_col);"""
assertEquals("varchar_col", result[0][0])
assertEquals("3600.0", result[0][2])
assertEquals("6.0", result[0][3])
assertEquals("0.0", result[0][4])
assertEquals("35950.0", result[0][5])

sql """drop stats orc_all_types"""
sql """analyze table orc_all_types with sync"""
result = sql """show column stats orc_all_types;"""
assertEquals(16, result.size())
result = sql """show column stats orc_all_types (int_col);"""
assertEquals("int_col", result[0][0])
assertEquals("3600.0", result[0][2])
Expand Down

0 comments on commit 8e33cda

Please sign in to comment.