From 6bd8aa9c00c4fb1d0811c27dde9e2dd31cc9b505 Mon Sep 17 00:00:00 2001 From: Anupam Yadav Date: Sun, 7 Jun 2026 02:05:24 +0000 Subject: [PATCH 1/3] GH-3601: Compute shouldIgnoreStatistics once per file in ParquetMetadataConverter --- .../converter/ParquetMetadataConverter.java | 60 ++++++++++++++++++- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 50c2e344e2..a56b531a96 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -994,6 +994,46 @@ public org.apache.parquet.column.statistics.Statistics fromParquetStatistics( return fromParquetStatisticsInternal(createdBy, statistics, type, expectedOrder); } + // Overload that uses a pre-computed shouldIgnoreCorruptStats flag to avoid redundant parsing + private org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal( + String createdBy, Statistics formatStats, PrimitiveType type, boolean shouldIgnoreCorruptStats) { + SortOrder typeSortOrder = overrideSortOrderToSigned(type) ? SortOrder.SIGNED : sortOrder(type); + org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = + org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type); + + if (formatStats != null) { + if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) { + byte[] min = formatStats.min_value.array(); + byte[] max = formatStats.max_value.array(); + if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { + statsBuilder.withMin(min); + statsBuilder.withMax(max); + } + } else { + boolean isSet = formatStats.isSetMax() && formatStats.isSetMin(); + boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false; + boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; + // The shouldIgnoreCorruptStats flag applies only to BINARY and FIXED_LEN_BYTE_ARRAY. + // For other types, shouldIgnoreStatistics always returns false, so we only guard those. + PrimitiveTypeName primitiveTypeName = type.getPrimitiveTypeName(); + boolean ignoreForThisColumn = shouldIgnoreCorruptStats + && (primitiveTypeName == PrimitiveTypeName.BINARY + || primitiveTypeName == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY); + if (!ignoreForThisColumn && (sortOrdersMatch || maxEqualsMin)) { + if (isSet) { + statsBuilder.withMin(formatStats.min.array()); + statsBuilder.withMax(formatStats.max.array()); + } + } + } + + if (formatStats.isSetNull_count()) { + statsBuilder.withNumNulls(formatStats.null_count); + } + } + return statsBuilder.build(); + } + GeospatialStatistics toParquetGeospatialStatistics( org.apache.parquet.column.statistics.geospatial.GeospatialStatistics geospatialStatistics) { if (geospatialStatistics == null) { @@ -1818,13 +1858,24 @@ public FileMetaDataAndRowGroupOffsetInfo visit(RangeMetadataFilter filter) throw public ColumnChunkMetaData buildColumnChunkMetaData( ColumnMetaData metaData, ColumnPath columnPath, PrimitiveType type, String createdBy) { + boolean shouldIgnoreCorruptStats = + CorruptStatistics.shouldIgnoreStatistics(createdBy, PrimitiveTypeName.BINARY); + return buildColumnChunkMetaData(metaData, columnPath, type, createdBy, shouldIgnoreCorruptStats); + } + + ColumnChunkMetaData buildColumnChunkMetaData( + ColumnMetaData metaData, + ColumnPath columnPath, + PrimitiveType type, + String createdBy, + boolean shouldIgnoreCorruptStats) { return ColumnChunkMetaData.get( columnPath, type, fromFormatCodec(metaData.codec), convertEncodingStats(metaData.getEncoding_stats()), fromFormatEncodings(metaData.encodings), - fromParquetStatistics(createdBy, metaData.statistics, type), + fromParquetStatisticsInternal(createdBy, metaData.statistics, type, shouldIgnoreCorruptStats), metaData.data_page_offset, metaData.dictionary_page_offset, metaData.num_values, @@ -1853,6 +1904,10 @@ public ParquetMetadata fromParquetMetadata( MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders()); List blocks = new ArrayList(); List row_groups = parquetMetadata.getRow_groups(); + // Compute once per file: the result is the same for BINARY and FIXED_LEN_BYTE_ARRAY + // (the only types affected by PARQUET-251), and always false for other types. + boolean shouldIgnoreCorruptStats = + CorruptStatistics.shouldIgnoreStatistics(parquetMetadata.getCreated_by(), PrimitiveTypeName.BINARY); if (row_groups != null) { for (RowGroup rowGroup : row_groups) { @@ -1933,7 +1988,8 @@ public ParquetMetadata fromParquetMetadata( metaData, columnPath, messageType.getType(columnPath.toArray()).asPrimitiveType(), - createdBy); + createdBy, + shouldIgnoreCorruptStats); column.setRowGroupOrdinal(rowGroup.getOrdinal()); if (metaData.isSetBloom_filter_offset()) { column.setBloomFilterOffset(metaData.getBloom_filter_offset()); From 9c5df2925f9c8cdddf73094f1392564e208e9477 Mon Sep 17 00:00:00 2001 From: Anupam Yadav Date: Sat, 20 Jun 2026 07:24:00 +0000 Subject: [PATCH 2/3] GH-3601: Address review - factor CorruptStatistics into file-level + column-type checks; precompute once per file (schema-gated) --- .../org/apache/parquet/CorruptStatistics.java | 43 +++++--- .../apache/parquet/CorruptStatisticsTest.java | 30 ++++++ .../converter/ParquetMetadataConverter.java | 88 +++++++---------- .../TestParquetMetadataConverter.java | 99 +++++++++++++++++++ 4 files changed, 195 insertions(+), 65 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java b/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java index c5846f9efa..79db8ccd2a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java @@ -47,20 +47,25 @@ public class CorruptStatistics { private static final SemanticVersion CDH_5_PARQUET_251_FIXED_END = new SemanticVersion(1, 5, 0); /** - * Decides if the statistics from a file created by createdBy (the created_by field from parquet format) - * should be ignored because they are potentially corrupt. + * Returns whether the given column type is one of the types affected by the PARQUET-251 bug + * (BINARY or FIXED_LEN_BYTE_ARRAY). * - * @param createdBy the created-by string from a file footer - * @param columnType the type of the column that this is checking - * @return true if the statistics may be invalid and should be ignored, false otherwise + * @param columnType the primitive type of the column + * @return true if this column type could have corrupt statistics */ - public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName columnType) { - - if (columnType != PrimitiveTypeName.BINARY && columnType != PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) { - // the bug only applies to binary columns - return false; - } + public static boolean isCorruptStatisticsColumnType(PrimitiveTypeName columnType) { + return columnType == PrimitiveTypeName.BINARY || columnType == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; + } + /** + * Determines whether a file (identified by its created_by string) was written by a version of + * parquet-mr that had the PARQUET-251 statistics bug. This is a file-level check that does not + * consider column type. + * + * @param createdBy the created-by string from a file footer + * @return true if the file was written by a version with the corrupt statistics bug + */ + public static boolean fileHasCorruptStatistics(String createdBy) { if (Strings.isNullOrEmpty(createdBy)) { // created_by is not populated, which could have been caused by // parquet-mr during the same time as PARQUET-251, see PARQUET-297 @@ -103,6 +108,22 @@ public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName } } + /** + * Decides if the statistics from a file created by createdBy (the created_by field from parquet format) + * should be ignored because they are potentially corrupt. + * + * @param createdBy the created-by string from a file footer + * @param columnType the type of the column that this is checking + * @return true if the statistics may be invalid and should be ignored, false otherwise + */ + public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName columnType) { + if (!isCorruptStatisticsColumnType(columnType)) { + // the bug only applies to binary columns + return false; + } + return fileHasCorruptStatistics(createdBy); + } + private static void warnParseErrorOnce(String createdBy, Throwable e) { if (!alreadyLogged.getAndSet(true)) { LOG.warn("Ignoring statistics because created_by could not be parsed (see PARQUET-251): " + createdBy, e); diff --git a/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java b/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java index e897b38512..4287c11e73 100644 --- a/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java +++ b/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java @@ -124,4 +124,34 @@ public void testDistributionCorruptStatistics() { assertTrue(CorruptStatistics.shouldIgnoreStatistics( "parquet-mr version 1.7.0 (build abcd)", PrimitiveTypeName.BINARY)); } + + @Test + public void testIsCorruptStatisticsColumnType() { + assertTrue(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.BINARY)); + assertTrue(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)); + assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.INT32)); + assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.INT64)); + assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.DOUBLE)); + assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.FLOAT)); + assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.BOOLEAN)); + assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.INT96)); + } + + @Test + public void testFileHasCorruptStatistics() { + // Corrupt versions + assertTrue(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 1.6.0 (build abcd)")); + assertTrue(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 1.4.2 (build abcd)")); + assertTrue(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 1.7.999 (build abcd)")); + // Null/empty + assertTrue(CorruptStatistics.fileHasCorruptStatistics(null)); + assertTrue(CorruptStatistics.fileHasCorruptStatistics("")); + // Unparseable + assertTrue(CorruptStatistics.fileHasCorruptStatistics("unparseable string")); + // Fixed versions + assertFalse(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 1.8.0 (build abcd)")); + assertFalse(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 2.0.0 (build abcd)")); + // Non-parquet-mr applications + assertFalse(CorruptStatistics.fileHasCorruptStatistics("impala version 1.6.0 (build abcd)")); + } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index a56b531a96..349d70e01b 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -947,6 +947,13 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist // Visible for testing static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal( String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) { + boolean fileHasCorruptStats = CorruptStatistics.fileHasCorruptStatistics(createdBy); + return fromParquetStatisticsInternal(formatStats, type, typeSortOrder, fileHasCorruptStats); + } + + // Core implementation using pre-computed file-level corrupt stats flag + static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal( + Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder, boolean fileHasCorruptStats) { // create stats object based on the column type org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type); @@ -969,8 +976,10 @@ static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInte // valid with the type's sort order. In previous releases, all stats were // aggregated using a signed byte-wise ordering, which isn't valid for all the // types (e.g. strings, decimals etc.). - if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) - && (sortOrdersMatch || maxEqualsMin)) { + // The fileHasCorruptStats flag applies only to BINARY and FIXED_LEN_BYTE_ARRAY columns. + boolean ignoreForThisColumn = + fileHasCorruptStats && CorruptStatistics.isCorruptStatisticsColumnType(type.getPrimitiveTypeName()); + if (!ignoreForThisColumn && (sortOrdersMatch || maxEqualsMin)) { if (isSet) { statsBuilder.withMin(formatStats.min.array()); statsBuilder.withMax(formatStats.max.array()); @@ -994,46 +1003,6 @@ public org.apache.parquet.column.statistics.Statistics fromParquetStatistics( return fromParquetStatisticsInternal(createdBy, statistics, type, expectedOrder); } - // Overload that uses a pre-computed shouldIgnoreCorruptStats flag to avoid redundant parsing - private org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal( - String createdBy, Statistics formatStats, PrimitiveType type, boolean shouldIgnoreCorruptStats) { - SortOrder typeSortOrder = overrideSortOrderToSigned(type) ? SortOrder.SIGNED : sortOrder(type); - org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = - org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type); - - if (formatStats != null) { - if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) { - byte[] min = formatStats.min_value.array(); - byte[] max = formatStats.max_value.array(); - if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { - statsBuilder.withMin(min); - statsBuilder.withMax(max); - } - } else { - boolean isSet = formatStats.isSetMax() && formatStats.isSetMin(); - boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false; - boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; - // The shouldIgnoreCorruptStats flag applies only to BINARY and FIXED_LEN_BYTE_ARRAY. - // For other types, shouldIgnoreStatistics always returns false, so we only guard those. - PrimitiveTypeName primitiveTypeName = type.getPrimitiveTypeName(); - boolean ignoreForThisColumn = shouldIgnoreCorruptStats - && (primitiveTypeName == PrimitiveTypeName.BINARY - || primitiveTypeName == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY); - if (!ignoreForThisColumn && (sortOrdersMatch || maxEqualsMin)) { - if (isSet) { - statsBuilder.withMin(formatStats.min.array()); - statsBuilder.withMax(formatStats.max.array()); - } - } - } - - if (formatStats.isSetNull_count()) { - statsBuilder.withNumNulls(formatStats.null_count); - } - } - return statsBuilder.build(); - } - GeospatialStatistics toParquetGeospatialStatistics( org.apache.parquet.column.statistics.geospatial.GeospatialStatistics geospatialStatistics) { if (geospatialStatistics == null) { @@ -1858,24 +1827,23 @@ public FileMetaDataAndRowGroupOffsetInfo visit(RangeMetadataFilter filter) throw public ColumnChunkMetaData buildColumnChunkMetaData( ColumnMetaData metaData, ColumnPath columnPath, PrimitiveType type, String createdBy) { - boolean shouldIgnoreCorruptStats = - CorruptStatistics.shouldIgnoreStatistics(createdBy, PrimitiveTypeName.BINARY); - return buildColumnChunkMetaData(metaData, columnPath, type, createdBy, shouldIgnoreCorruptStats); + boolean fileHasCorruptStats = CorruptStatistics.fileHasCorruptStatistics(createdBy); + return buildColumnChunkMetaData(metaData, columnPath, type, fileHasCorruptStats); } ColumnChunkMetaData buildColumnChunkMetaData( ColumnMetaData metaData, ColumnPath columnPath, PrimitiveType type, - String createdBy, - boolean shouldIgnoreCorruptStats) { + boolean fileHasCorruptStats) { + SortOrder typeSortOrder = overrideSortOrderToSigned(type) ? SortOrder.SIGNED : sortOrder(type); return ColumnChunkMetaData.get( columnPath, type, fromFormatCodec(metaData.codec), convertEncodingStats(metaData.getEncoding_stats()), fromFormatEncodings(metaData.encodings), - fromParquetStatisticsInternal(createdBy, metaData.statistics, type, shouldIgnoreCorruptStats), + fromParquetStatisticsInternal(metaData.statistics, type, typeSortOrder, fileHasCorruptStats), metaData.data_page_offset, metaData.dictionary_page_offset, metaData.num_values, @@ -1904,10 +1872,11 @@ public ParquetMetadata fromParquetMetadata( MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders()); List blocks = new ArrayList(); List row_groups = parquetMetadata.getRow_groups(); - // Compute once per file: the result is the same for BINARY and FIXED_LEN_BYTE_ARRAY - // (the only types affected by PARQUET-251), and always false for other types. - boolean shouldIgnoreCorruptStats = - CorruptStatistics.shouldIgnoreStatistics(parquetMetadata.getCreated_by(), PrimitiveTypeName.BINARY); + // Compute once per file: whether this file was written by a version with the PARQUET-251 bug. + // Only parse created_by if the schema has columns affected by the bug (BINARY/FIXED_LEN_BYTE_ARRAY). + // The per-column type check is applied later when statistics are actually read. + boolean fileHasCorruptStats = schemaHasCorruptStatisticsColumnType(messageType) + && CorruptStatistics.fileHasCorruptStatistics(parquetMetadata.getCreated_by()); if (row_groups != null) { for (RowGroup rowGroup : row_groups) { @@ -1988,8 +1957,7 @@ public ParquetMetadata fromParquetMetadata( metaData, columnPath, messageType.getType(columnPath.toArray()).asPrimitiveType(), - createdBy, - shouldIgnoreCorruptStats); + fileHasCorruptStats); column.setRowGroupOrdinal(rowGroup.getOrdinal()); if (metaData.isSetBloom_filter_offset()) { column.setBloomFilterOffset(metaData.getBloom_filter_offset()); @@ -2068,6 +2036,18 @@ private static ColumnPath getPath(ColumnMetaData metaData) { return ColumnPath.get(path); } + /** + * Returns true if the schema contains at least one column with a type affected by the PARQUET-251 bug. + */ + private static boolean schemaHasCorruptStatisticsColumnType(MessageType schema) { + for (ColumnDescriptor column : schema.getColumns()) { + if (CorruptStatistics.isCorruptStatisticsColumnType(column.getPrimitiveType().getPrimitiveTypeName())) { + return true; + } + } + return false; + } + // Visible for testing MessageType fromParquetSchema(List schema, List columnOrders) { Iterator iterator = schema.iterator(); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 8d778f7b91..bdb9325324 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -2158,5 +2158,104 @@ public void testColumnIndexNanCountsRoundTrip() { ColumnIndex roundTrip = ParquetMetadataConverter.fromParquetColumnIndex(type, parquetColumnIndex); assertNotNull(roundTrip); assertEquals(List.of(1L, 0L, 0L), roundTrip.getNanCounts()); + + public void testCorruptStatsPerColumnGate() { + // A created_by string from a version known to have the PARQUET-251 bug + String corruptCreatedBy = "parquet-mr version 1.6.0 (build abcd)"; + + // Set up legacy V1 statistics with min/max (not min_value/max_value) + org.apache.parquet.format.Statistics formatStats = new org.apache.parquet.format.Statistics(); + byte[] minBytes = new byte[] {0, 1, 2, 3}; + byte[] maxBytes = new byte[] {4, 5, 6, 7}; + formatStats.setMin(minBytes); + formatStats.setMax(maxBytes); + formatStats.setNull_count(5); + + // For BINARY column: stats should be ignored (only null_count preserved) + PrimitiveType binaryType = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "bin_col"); + Statistics binaryStats = ParquetMetadataConverter.fromParquetStatisticsInternal( + corruptCreatedBy, formatStats, binaryType, ParquetMetadataConverter.SortOrder.SIGNED); + assertFalse("BINARY min/max should be ignored for corrupt file", binaryStats.hasNonNullValue()); + assertEquals(5, binaryStats.getNumNulls()); + + // For FIXED_LEN_BYTE_ARRAY column: stats should also be ignored + PrimitiveType fixedType = + new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, 4, "fixed_col"); + Statistics fixedStats = ParquetMetadataConverter.fromParquetStatisticsInternal( + corruptCreatedBy, formatStats, fixedType, ParquetMetadataConverter.SortOrder.SIGNED); + assertFalse("FIXED_LEN_BYTE_ARRAY min/max should be ignored for corrupt file", fixedStats.hasNonNullValue()); + assertEquals(5, fixedStats.getNumNulls()); + + // For INT32 column: stats should NOT be ignored (per-column gate) + PrimitiveType int32Type = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT32, "int_col"); + Statistics int32Stats = ParquetMetadataConverter.fromParquetStatisticsInternal( + corruptCreatedBy, formatStats, int32Type, ParquetMetadataConverter.SortOrder.SIGNED); + assertTrue("INT32 min/max should NOT be ignored for corrupt file", int32Stats.hasNonNullValue()); + assertEquals(5, int32Stats.getNumNulls()); + + // For INT64 column: stats should NOT be ignored + org.apache.parquet.format.Statistics formatStatsLong = new org.apache.parquet.format.Statistics(); + byte[] minLong = new byte[] {0, 0, 0, 0, 0, 0, 0, 1}; + byte[] maxLong = new byte[] {0, 0, 0, 0, 0, 0, 0, 7}; + formatStatsLong.setMin(minLong); + formatStatsLong.setMax(maxLong); + formatStatsLong.setNull_count(5); + PrimitiveType int64Type = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT64, "long_col"); + Statistics int64Stats = ParquetMetadataConverter.fromParquetStatisticsInternal( + corruptCreatedBy, formatStatsLong, int64Type, ParquetMetadataConverter.SortOrder.SIGNED); + assertTrue("INT64 min/max should NOT be ignored for corrupt file", int64Stats.hasNonNullValue()); + assertEquals(5, int64Stats.getNumNulls()); + + // For a non-corrupt file, BINARY stats should be preserved + String goodCreatedBy = "parquet-mr version 1.8.0 (build abcd)"; + Statistics binaryStatsGood = ParquetMetadataConverter.fromParquetStatisticsInternal( + goodCreatedBy, formatStats, binaryType, ParquetMetadataConverter.SortOrder.SIGNED); + assertTrue("BINARY min/max should be kept for non-corrupt file", binaryStatsGood.hasNonNullValue()); + } + + @Test + public void testSchemaGateSkipsCorruptStatsCheckForNonBinarySchema() throws Exception { + // A file with ONLY INT32/INT64 columns should never trigger corrupt stats detection, + // even with a known-corrupt createdBy string. + String corruptCreatedBy = "parquet-mr version 1.6.0 (build abcd)"; + + MessageType intOnlySchema = Types.buildMessage() + .required(PrimitiveTypeName.INT32).named("id") + .required(PrimitiveTypeName.INT64).named("ts") + .named("msg"); + + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + List schemaElements = converter.toParquetSchema(intOnlySchema); + + // Build ColumnMetaData with V1 statistics for the INT32 column + org.apache.parquet.format.Statistics stats = new org.apache.parquet.format.Statistics(); + stats.setMin(new byte[] {0, 0, 0, 1}); + stats.setMax(new byte[] {0, 0, 0, 7}); + stats.setNull_count(0); + + ColumnMetaData cmd = new ColumnMetaData( + Type.INT32, + Collections.emptyList(), + Collections.singletonList("id"), + UNCOMPRESSED, + 100L, 200L, 100L, 0L); + cmd.setStatistics(stats); + + ColumnChunk cc = new ColumnChunk(0L); + cc.setMeta_data(cmd); + RowGroup rg = new RowGroup(List.of(cc), 100L, 1); + + FileMetaData fmd = new FileMetaData(1, schemaElements, 1, List.of(rg)); + fmd.setCreated_by(corruptCreatedBy); + + // Parse via fromParquetMetadata – the schema gate should prevent fileHasCorruptStats + ParquetMetadata metadata = converter.fromParquetMetadata(fmd); + Statistics columnStats = + metadata.getBlocks().get(0).getColumns().get(0).getStatistics(); + + // Stats should be preserved (not ignored) because schema has no BINARY/FIXED columns + assertTrue( + "INT32 stats should be preserved when schema has no corrupt-stats-affected columns", + columnStats.hasNonNullValue()); } } From 4bc68a93ec0ea0438781c23c1c793e0ec343020c Mon Sep 17 00:00:00 2001 From: Anupam Yadav Date: Thu, 2 Jul 2026 20:54:53 +0000 Subject: [PATCH 3/3] GH-3601: Rename to mayHaveCorruptStatistics and minimize new public API --- .../org/apache/parquet/CorruptStatistics.java | 17 ++++---- .../apache/parquet/CorruptStatisticsTest.java | 41 ++++++++++--------- .../converter/ParquetMetadataConverter.java | 19 ++++----- .../TestParquetMetadataConverter.java | 13 ++++-- 4 files changed, 48 insertions(+), 42 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java b/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java index 79db8ccd2a..1957943d6a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java @@ -49,23 +49,20 @@ public class CorruptStatistics { /** * Returns whether the given column type is one of the types affected by the PARQUET-251 bug * (BINARY or FIXED_LEN_BYTE_ARRAY). - * - * @param columnType the primitive type of the column - * @return true if this column type could have corrupt statistics */ - public static boolean isCorruptStatisticsColumnType(PrimitiveTypeName columnType) { + private static boolean isCorruptStatisticsColumnType(PrimitiveTypeName columnType) { return columnType == PrimitiveTypeName.BINARY || columnType == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; } /** - * Determines whether a file (identified by its created_by string) was written by a version of - * parquet-mr that had the PARQUET-251 statistics bug. This is a file-level check that does not - * consider column type. + * Determines whether a file (identified by its created_by string) may have been written by a + * version of parquet-mr that had the PARQUET-251 statistics bug. This is a file-level check + * that does not consider column type. * * @param createdBy the created-by string from a file footer - * @return true if the file was written by a version with the corrupt statistics bug + * @return true if the file may have been written by a version with the corrupt statistics bug */ - public static boolean fileHasCorruptStatistics(String createdBy) { + public static boolean mayHaveCorruptStatistics(String createdBy) { if (Strings.isNullOrEmpty(createdBy)) { // created_by is not populated, which could have been caused by // parquet-mr during the same time as PARQUET-251, see PARQUET-297 @@ -121,7 +118,7 @@ public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName // the bug only applies to binary columns return false; } - return fileHasCorruptStatistics(createdBy); + return mayHaveCorruptStatistics(createdBy); } private static void warnParseErrorOnce(String createdBy, Throwable e) { diff --git a/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java b/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java index 4287c11e73..174b3db64a 100644 --- a/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java +++ b/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java @@ -126,32 +126,35 @@ public void testDistributionCorruptStatistics() { } @Test - public void testIsCorruptStatisticsColumnType() { - assertTrue(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.BINARY)); - assertTrue(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)); - assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.INT32)); - assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.INT64)); - assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.DOUBLE)); - assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.FLOAT)); - assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.BOOLEAN)); - assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.INT96)); + public void testCorruptStatisticsColumnType() { + // These column types are affected by the PARQUET-251 bug + String corruptVersion = "parquet-mr version 1.6.0 (build abcd)"; + assertTrue(CorruptStatistics.shouldIgnoreStatistics(corruptVersion, PrimitiveTypeName.BINARY)); + assertTrue(CorruptStatistics.shouldIgnoreStatistics(corruptVersion, PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)); + // These column types are NOT affected + assertFalse(CorruptStatistics.shouldIgnoreStatistics(corruptVersion, PrimitiveTypeName.INT32)); + assertFalse(CorruptStatistics.shouldIgnoreStatistics(corruptVersion, PrimitiveTypeName.INT64)); + assertFalse(CorruptStatistics.shouldIgnoreStatistics(corruptVersion, PrimitiveTypeName.DOUBLE)); + assertFalse(CorruptStatistics.shouldIgnoreStatistics(corruptVersion, PrimitiveTypeName.FLOAT)); + assertFalse(CorruptStatistics.shouldIgnoreStatistics(corruptVersion, PrimitiveTypeName.BOOLEAN)); + assertFalse(CorruptStatistics.shouldIgnoreStatistics(corruptVersion, PrimitiveTypeName.INT96)); } @Test - public void testFileHasCorruptStatistics() { + public void testMayHaveCorruptStatistics() { // Corrupt versions - assertTrue(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 1.6.0 (build abcd)")); - assertTrue(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 1.4.2 (build abcd)")); - assertTrue(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 1.7.999 (build abcd)")); + assertTrue(CorruptStatistics.mayHaveCorruptStatistics("parquet-mr version 1.6.0 (build abcd)")); + assertTrue(CorruptStatistics.mayHaveCorruptStatistics("parquet-mr version 1.4.2 (build abcd)")); + assertTrue(CorruptStatistics.mayHaveCorruptStatistics("parquet-mr version 1.7.999 (build abcd)")); // Null/empty - assertTrue(CorruptStatistics.fileHasCorruptStatistics(null)); - assertTrue(CorruptStatistics.fileHasCorruptStatistics("")); + assertTrue(CorruptStatistics.mayHaveCorruptStatistics(null)); + assertTrue(CorruptStatistics.mayHaveCorruptStatistics("")); // Unparseable - assertTrue(CorruptStatistics.fileHasCorruptStatistics("unparseable string")); + assertTrue(CorruptStatistics.mayHaveCorruptStatistics("unparseable string")); // Fixed versions - assertFalse(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 1.8.0 (build abcd)")); - assertFalse(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 2.0.0 (build abcd)")); + assertFalse(CorruptStatistics.mayHaveCorruptStatistics("parquet-mr version 1.8.0 (build abcd)")); + assertFalse(CorruptStatistics.mayHaveCorruptStatistics("parquet-mr version 2.0.0 (build abcd)")); // Non-parquet-mr applications - assertFalse(CorruptStatistics.fileHasCorruptStatistics("impala version 1.6.0 (build abcd)")); + assertFalse(CorruptStatistics.mayHaveCorruptStatistics("impala version 1.6.0 (build abcd)")); } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 349d70e01b..2e611fd67b 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -947,7 +947,7 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist // Visible for testing static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal( String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) { - boolean fileHasCorruptStats = CorruptStatistics.fileHasCorruptStatistics(createdBy); + boolean fileHasCorruptStats = CorruptStatistics.mayHaveCorruptStatistics(createdBy); return fromParquetStatisticsInternal(formatStats, type, typeSortOrder, fileHasCorruptStats); } @@ -977,8 +977,9 @@ static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInte // aggregated using a signed byte-wise ordering, which isn't valid for all the // types (e.g. strings, decimals etc.). // The fileHasCorruptStats flag applies only to BINARY and FIXED_LEN_BYTE_ARRAY columns. - boolean ignoreForThisColumn = - fileHasCorruptStats && CorruptStatistics.isCorruptStatisticsColumnType(type.getPrimitiveTypeName()); + boolean ignoreForThisColumn = fileHasCorruptStats + && (type.getPrimitiveTypeName() == PrimitiveTypeName.BINARY + || type.getPrimitiveTypeName() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY); if (!ignoreForThisColumn && (sortOrdersMatch || maxEqualsMin)) { if (isSet) { statsBuilder.withMin(formatStats.min.array()); @@ -1827,15 +1828,12 @@ public FileMetaDataAndRowGroupOffsetInfo visit(RangeMetadataFilter filter) throw public ColumnChunkMetaData buildColumnChunkMetaData( ColumnMetaData metaData, ColumnPath columnPath, PrimitiveType type, String createdBy) { - boolean fileHasCorruptStats = CorruptStatistics.fileHasCorruptStatistics(createdBy); + boolean fileHasCorruptStats = CorruptStatistics.mayHaveCorruptStatistics(createdBy); return buildColumnChunkMetaData(metaData, columnPath, type, fileHasCorruptStats); } ColumnChunkMetaData buildColumnChunkMetaData( - ColumnMetaData metaData, - ColumnPath columnPath, - PrimitiveType type, - boolean fileHasCorruptStats) { + ColumnMetaData metaData, ColumnPath columnPath, PrimitiveType type, boolean fileHasCorruptStats) { SortOrder typeSortOrder = overrideSortOrderToSigned(type) ? SortOrder.SIGNED : sortOrder(type); return ColumnChunkMetaData.get( columnPath, @@ -1876,7 +1874,7 @@ public ParquetMetadata fromParquetMetadata( // Only parse created_by if the schema has columns affected by the bug (BINARY/FIXED_LEN_BYTE_ARRAY). // The per-column type check is applied later when statistics are actually read. boolean fileHasCorruptStats = schemaHasCorruptStatisticsColumnType(messageType) - && CorruptStatistics.fileHasCorruptStatistics(parquetMetadata.getCreated_by()); + && CorruptStatistics.mayHaveCorruptStatistics(parquetMetadata.getCreated_by()); if (row_groups != null) { for (RowGroup rowGroup : row_groups) { @@ -2041,7 +2039,8 @@ private static ColumnPath getPath(ColumnMetaData metaData) { */ private static boolean schemaHasCorruptStatisticsColumnType(MessageType schema) { for (ColumnDescriptor column : schema.getColumns()) { - if (CorruptStatistics.isCorruptStatisticsColumnType(column.getPrimitiveType().getPrimitiveTypeName())) { + PrimitiveTypeName typeName = column.getPrimitiveType().getPrimitiveTypeName(); + if (typeName == PrimitiveTypeName.BINARY || typeName == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) { return true; } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index bdb9325324..d8a6521e57 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -2158,7 +2158,9 @@ public void testColumnIndexNanCountsRoundTrip() { ColumnIndex roundTrip = ParquetMetadataConverter.fromParquetColumnIndex(type, parquetColumnIndex); assertNotNull(roundTrip); assertEquals(List.of(1L, 0L, 0L), roundTrip.getNanCounts()); + } + @Test public void testCorruptStatsPerColumnGate() { // A created_by string from a version known to have the PARQUET-251 bug String corruptCreatedBy = "parquet-mr version 1.6.0 (build abcd)"; @@ -2220,8 +2222,10 @@ public void testSchemaGateSkipsCorruptStatsCheckForNonBinarySchema() throws Exce String corruptCreatedBy = "parquet-mr version 1.6.0 (build abcd)"; MessageType intOnlySchema = Types.buildMessage() - .required(PrimitiveTypeName.INT32).named("id") - .required(PrimitiveTypeName.INT64).named("ts") + .required(PrimitiveTypeName.INT32) + .named("id") + .required(PrimitiveTypeName.INT64) + .named("ts") .named("msg"); ParquetMetadataConverter converter = new ParquetMetadataConverter(); @@ -2238,7 +2242,10 @@ public void testSchemaGateSkipsCorruptStatsCheckForNonBinarySchema() throws Exce Collections.emptyList(), Collections.singletonList("id"), UNCOMPRESSED, - 100L, 200L, 100L, 0L); + 100L, + 200L, + 100L, + 0L); cmd.setStatistics(stats); ColumnChunk cc = new ColumnChunk(0L);