Commit 3246521

revert to previous approach
huaxingao committed Jan 10, 2025
1 parent 942d24f commit 3246521
Showing 2 changed files with 51 additions and 25 deletions.
74 changes: 50 additions & 24 deletions parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java
@@ -55,6 +55,7 @@
 import java.util.Locale;
 import java.util.Map;
 import java.util.Objects;
+import java.util.function.BiConsumer;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
@@ -95,6 +96,7 @@
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.types.Types;
 import org.apache.iceberg.util.ArrayUtil;
 import org.apache.iceberg.util.ByteBuffers;
 import org.apache.iceberg.util.PropertyUtil;
@@ -115,8 +117,12 @@
 import org.apache.parquet.hadoop.api.WriteSupport;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 import org.apache.parquet.schema.MessageType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class Parquet {
+  private static final Logger LOG = LoggerFactory.getLogger(Parquet.class);
+
   private Parquet() {}
 
   private static final Collection<String> READ_PROPERTIES_TO_REMOVE =
@@ -266,6 +272,43 @@ private WriteBuilder createContextFunc(
       return this;
     }
 
+    private <T> void setBloomFilterConfig(
+        Context context,
+        MessageType parquetSchema,
+        BiConsumer<String, Boolean> withBloomFilterEnabled,
+        BiConsumer<String, Double> withBloomFilterFPP) {
+
+      Map<Integer, String> fieldIdToParquetPath =
+          parquetSchema.getColumns().stream()
+              .collect(
+                  Collectors.toMap(
+                      col -> col.getPrimitiveType().getId().intValue(),
+                      col -> String.join(".", col.getPath())));
+
+      context
+          .columnBloomFilterEnabled()
+          .forEach(
+              (colPath, isEnabled) -> {
+                Types.NestedField fieldId = schema.findField(colPath);
+                if (fieldId == null) {
+                  LOG.warn("Skipping bloom filter config for missing field: {}", colPath);
+                  return;
+                }
+
+                String parquetColumnPath = fieldIdToParquetPath.get(fieldId.fieldId());
+                if (parquetColumnPath == null) {
+                  LOG.warn("Skipping bloom filter config for missing field: {}", fieldId);
+                  return;
+                }
+
+                withBloomFilterEnabled.accept(parquetColumnPath, Boolean.valueOf(isEnabled));
+                String fpp = context.columnBloomFilterFpp().get(colPath);
+                if (fpp != null) {
+                  withBloomFilterFPP.accept(parquetColumnPath, Double.parseDouble(fpp));
+                }
+              });
+    }
+
     public <D> FileAppender<D> build() throws IOException {
       Preconditions.checkNotNull(schema, "Schema is required");
       Preconditions.checkNotNull(name, "Table name is required and cannot be null");
@@ -285,8 +328,6 @@ public <D> FileAppender<D> build() throws IOException {
       int rowGroupCheckMinRecordCount = context.rowGroupCheckMinRecordCount();
       int rowGroupCheckMaxRecordCount = context.rowGroupCheckMaxRecordCount();
       int bloomFilterMaxBytes = context.bloomFilterMaxBytes();
-      Map<String, String> columnBloomFilterFpp = context.columnBloomFilterFpp();
-      Map<String, String> columnBloomFilterEnabled = context.columnBloomFilterEnabled();
       boolean dictionaryEnabled = context.dictionaryEnabled();
 
       if (compressionLevel != null) {
@@ -343,17 +384,8 @@
                 .withMaxRowCountForPageSizeCheck(rowGroupCheckMaxRecordCount)
                 .withMaxBloomFilterBytes(bloomFilterMaxBytes);
 
-        for (Map.Entry<String, String> entry : columnBloomFilterEnabled.entrySet()) {
-          String colPath = AvroSchemaUtil.makeCompatibleName(entry.getKey());
-          String bloomEnabled = entry.getValue();
-          propsBuilder.withBloomFilterEnabled(colPath, Boolean.parseBoolean(bloomEnabled));
-        }
-
-        for (Map.Entry<String, String> entry : columnBloomFilterFpp.entrySet()) {
-          String colPath = AvroSchemaUtil.makeCompatibleName(entry.getKey());
-          String fpp = entry.getValue();
-          propsBuilder.withBloomFilterFPP(colPath, Double.parseDouble(fpp));
-        }
+        setBloomFilterConfig(
+            context, type, propsBuilder::withBloomFilterEnabled, propsBuilder::withBloomFilterFPP);
 
         ParquetProperties parquetProperties = propsBuilder.build();
 
@@ -386,17 +418,11 @@ public <D> FileAppender<D> build() throws IOException {
           .withDictionaryPageSize(dictionaryPageSize)
           .withEncryption(fileEncryptionProperties);
 
-      for (Map.Entry<String, String> entry : columnBloomFilterEnabled.entrySet()) {
-        String colPath = AvroSchemaUtil.makeCompatibleName(entry.getKey());
-        String bloomEnabled = entry.getValue();
-        parquetWriteBuilder.withBloomFilterEnabled(colPath, Boolean.parseBoolean(bloomEnabled));
-      }
-
-      for (Map.Entry<String, String> entry : columnBloomFilterFpp.entrySet()) {
-        String colPath = AvroSchemaUtil.makeCompatibleName(entry.getKey());
-        String fpp = entry.getValue();
-        parquetWriteBuilder.withBloomFilterFPP(colPath, Double.parseDouble(fpp));
-      }
+      setBloomFilterConfig(
+          context,
+          type,
+          parquetWriteBuilder::withBloomFilterEnabled,
+          parquetWriteBuilder::withBloomFilterFPP);
 
       return new ParquetWriteAdapter<>(parquetWriteBuilder.build(), metricsConfig);
     }
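Note: the per-column settings consumed by the new setBloomFilterConfig method come from the table's write properties. A minimal usage sketch follows, assuming the standard Iceberg keys write.parquet.bloom-filter-enabled.column.<name> and write.parquet.bloom-filter-fpp.column.<name>; the table handle and the column name "id" are placeholders, not part of this change.

import org.apache.iceberg.Table;

class BloomFilterPropertiesExample {
  // Sketch only: enable a Parquet bloom filter for column "id" and tune its
  // false-positive probability; these per-column properties are what the
  // write context reads back when the Parquet writer above is built.
  static void enableBloomFilter(Table table) {
    table
        .updateProperties()
        .set("write.parquet.bloom-filter-enabled.column.id", "true")
        .set("write.parquet.bloom-filter-fpp.column.id", "0.01")
        .commit();
  }
}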
@@ -195,7 +195,7 @@ public void createInputFile() throws IOException {
 
     // build struct field schema
     org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(UNDERSCORE_STRUCT_FIELD_TYPE);
-    String compatibleFieldName = AvroSchemaUtil.makeCompatibleName("_incompatible-name");
+    String compatibleFieldName = "_incompatible_x2Dname";
 
     OutputFile outFile = Files.localOutput(temp);
     try (FileAppender<Record> appender =
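For reference, the expected name now hard-coded in the test is exactly what the previously used sanitizer produces for this column; a minimal illustrative sketch (the class name is hypothetical):

import org.apache.iceberg.avro.AvroSchemaUtil;

class CompatibleNameExample {
  public static void main(String[] args) {
    // "-" is not a legal Avro name character, so it is escaped to "_x2D".
    String sanitized = AvroSchemaUtil.makeCompatibleName("_incompatible-name");
    System.out.println(sanitized); // prints: _incompatible_x2Dname
  }
}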
