apache · zachdisc · Oct 14, 2024 · Oct 14, 2024 · Oct 14, 2024 · Oct 14, 2024
diff --git a/api/src/main/java/org/apache/iceberg/actions/RewriteManifests.java b/api/src/main/java/org/apache/iceberg/actions/RewriteManifests.java
@@ -18,7 +18,10 @@
  */
 package org.apache.iceberg.actions;
 
+import java.util.List;
+import java.util.function.Function;
 import java.util.function.Predicate;
+import org.apache.iceberg.DataFile;
 import org.apache.iceberg.ManifestFile;
 
 /** An action that rewrites manifests. */
@@ -44,6 +47,39 @@ public interface RewriteManifests
    */
   RewriteManifests rewriteIf(Predicate<ManifestFile> predicate);
 
+  /**
+   * Rewrites manifests in a given order, based on partition field names.
+   *
+   * <p>Supply an optional list of partition field names to sort the rewritten manifests by. The
+   * names must be the exact transformed column names used for partitioning, not the raw column
+   * names that the partitions are derived from. For example, supply 'data_bucket' and not 'data'
+   * for a bucket(N, data) partition definition.
+   *
+   * <p>If not set, manifests will be rewritten in the order of the transforms in the table's
+   * current partition spec.
+   *
+   * @param partitionFieldSortOrder a list of partition field names
+   * @return this for method chaining
+   * @throws UnsupportedOperationException if the implementation does not override this method
+   */
+  default RewriteManifests sort(List<String> partitionFieldSortOrder) {
+    String implName = this.getClass().getName();
+    throw new UnsupportedOperationException(implName + " doesn't implement sort(List<String>)");
+  }
+
+  /**
+   * Rewrites manifests in a given order, dictated by a custom function.
+   *
+   * <p>The supplied function applies its own custom clustering logic based on the attributes of
+   * each {@link org.apache.iceberg.DataFile} it is given.
+   *
+   * @param sortStrategyFunction a function that returns a String to be used for manifest clustering
+   * @return this for method chaining
+   * @throws UnsupportedOperationException if the implementation does not override this method
+   */
+  default RewriteManifests sort(Function<DataFile, String> sortStrategyFunction) {
+    String implName = this.getClass().getName();
+    throw new UnsupportedOperationException(
+        implName + " doesn't implement sort(Function<DataFile, String>)");
+  }
+
   /**
    * Passes a location where the staged manifests should be written.
    *

diff --git a/...3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteManifestsSparkAction.java b/...3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteManifestsSparkAction.java
@@ -19,10 +19,12 @@
 package org.apache.iceberg.spark.actions;
 
 import static org.apache.iceberg.MetadataTableType.ENTRIES;
+import static org.apache.spark.sql.functions.col;
 
 import java.io.Serializable;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Set;
 import java.util.UUID;
 import java.util.function.Function;
 import java.util.function.Predicate;
@@ -37,6 +39,7 @@
 import org.apache.iceberg.ManifestFile;
 import org.apache.iceberg.ManifestFiles;
 import org.apache.iceberg.ManifestWriter;
+import org.apache.iceberg.PartitionField;
 import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Partitioning;
 import org.apache.iceberg.RollingManifestWriter;
@@ -70,6 +73,10 @@
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.api.java.UDF1;
+import org.apache.spark.sql.expressions.UserDefinedFunction;
+import org.apache.spark.sql.functions;
+import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.StructType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -97,6 +104,8 @@ public class RewriteManifestsSparkAction
           .addedManifests(ImmutableList.of())
           .build();
 
+  // Name of the temporary column that carries the clustering key while manifest entries are
+  // repartitioned and sorted; the column is dropped again before the manifests are written.
+  private static final String CUSTOM_CLUSTERING_COLUMN_NAME = "__clustering_column__";
+
   private final Table table;
   private final int formatVersion;
   private final long targetManifestSizeBytes;
@@ -106,6 +115,9 @@ public class RewriteManifestsSparkAction
   private Predicate<ManifestFile> predicate = manifest -> true;
   private String outputLocation = null;
 
+  // Partition field names to sort rewritten manifest entries by; stays null until
+  // sort(List<String>) is called.
+  private List<String> partitionFieldSortOrder = null;
+  // Custom clustering function set through sort(Function); when non-null it is used in
+  // preference to partitionFieldSortOrder.
+  private Function<DataFile, String> partitionSortFunction = null;
+
   RewriteManifestsSparkAction(SparkSession spark, Table table) {
     super(spark);
     this.table = table;
@@ -160,6 +172,32 @@ public RewriteManifestsSparkAction stagingLocation(String newStagingLocation) {
     return this;
   }
 
+  /**
+   * Sets the partition fields by which entries of the rewritten manifests are sorted.
+   *
+   * <p>Every name is validated against the partition field names of the spec this action is
+   * currently configured with. The list is copied, so later changes to the caller's list have no
+   * effect on this action.
+   *
+   * @param sortOrder partition field names, in the desired sort order
+   * @return this for method chaining
+   * @throws IllegalArgumentException if the list is null or names a field that is not in the spec
+   */
+  @Override
+  public RewriteManifestsSparkAction sort(List<String> sortOrder) {
+    Preconditions.checkArgument(sortOrder != null, "Invalid partition field sort order: null");
+
+    // Collect set of allowable partition columns to sort on
+    Set<String> availablePartitionNames =
+        spec.fields().stream().map(PartitionField::name).collect(Collectors.toSet());
+
+    // Check if these partition fields are included in the spec
+    Preconditions.checkArgument(
+        availablePartitionNames.containsAll(sortOrder),
+        "Cannot use custom sort order to rewrite manifests '%s'. All partition columns must be "
+            + "defined in the current partition spec: %s. Choose from the available partitionable columns: %s",
+        sortOrder,
+        spec.specId(),
+        availablePartitionNames);
+
+    // Defensive copy: the caller must not be able to change the order after it was validated
+    this.partitionFieldSortOrder = ImmutableList.copyOf(sortOrder);
+    return this;
+  }
+
+  /**
+   * Sets a custom function whose String result is used to cluster entries of rewritten manifests.
+   *
+   * <p>The function is later handed to a {@link Serializable} Spark UDF, so it must itself
+   * implement {@link Serializable}. When set, it is used in preference to a partition field sort
+   * order.
+   *
+   * @param partitionFieldsSortStrategy a serializable function mapping a data file to its
+   *     clustering key
+   * @return this for method chaining
+   * @throws IllegalArgumentException if the function is null or not serializable
+   */
+  @Override
+  public RewriteManifestsSparkAction sort(Function<DataFile, String> partitionFieldsSortStrategy) {
+    // A cast cannot make an existing object serializable; it could only fail with an opaque
+    // ClassCastException. Validate explicitly so the caller gets an actionable message.
+    Preconditions.checkArgument(
+        partitionFieldsSortStrategy instanceof Serializable,
+        "Cannot use custom sort function to rewrite manifests: "
+            + "function must be non-null and implement java.io.Serializable");
+
+    this.partitionSortFunction = partitionFieldsSortStrategy;
+    return this;
+  }
+
   @Override
   public RewriteManifests.Result execute() {
     String desc = String.format("Rewriting manifests in %s", table.name());
@@ -250,12 +288,59 @@ private List<ManifestFile> writeUnpartitionedManifests(
   private List<ManifestFile> writePartitionedManifests(
       ManifestContent content, Dataset<Row> manifestEntryDF, int numManifests) {
 
+    // Extract desired clustering/sorting criteria into a dedicated column
+    Dataset<Row> clusteredManifestEntryDF;
+
+    if (partitionSortFunction != null) {
+      LOG.info(
+          "Sorting manifests for specId {} using custom sorting function",
+          spec.specId(),
+          partitionSortFunction);
+      Types.StructType partitionType = DataFile.getType(table.spec().partitionType());
+      StructType dataFileSchema = manifestEntryDF.select("data_file.*").schema();
+
+      // Create a UDF to wrap the custom partitionSortFunction call
+      UserDefinedFunction clusteringUdf =
+          functions.udf(
+              new CustomDataFileSorterUdf(
+                  this.partitionSortFunction, partitionType, dataFileSchema),
+              DataTypes.StringType);
+      // Apply supplied partitionSortFunction function to the data_file datums within this dataframe
+      // The results are stored as a String in the new __clustering_column__
+      clusteredManifestEntryDF =
+          manifestEntryDF.withColumn(
+              CUSTOM_CLUSTERING_COLUMN_NAME, clusteringUdf.apply(col("data_file")));
+    } else if (partitionFieldSortOrder != null) {
+      LOG.info(
+          "Sorting manifests for specId {} by partition columns in order of {} ",
+          spec.specId(),
+          partitionFieldSortOrder);
+
+      // Map the top level partition column names to the column name referenced within the manifest
+      // entry dataframe
+      Column[] actualPartitionColumns =
+          partitionFieldSortOrder.stream()
+              .map(p -> col("data_file.partition." + p))
+              .toArray(Column[]::new);
+
+      // Form a new temporary column to sort/cluster manifests on, based on the custom sort
+      // order provided
+      clusteredManifestEntryDF =
+          manifestEntryDF.withColumn(
+              CUSTOM_CLUSTERING_COLUMN_NAME, functions.struct(actualPartitionColumns));
+    } else {
+      clusteredManifestEntryDF =
+          manifestEntryDF.withColumn(CUSTOM_CLUSTERING_COLUMN_NAME, col("data_file.partition"));
+    }
+
     return withReusableDS(
-        manifestEntryDF,
+        clusteredManifestEntryDF,
         df -> {
           WriteManifests<?> writeFunc = newWriteManifestsFunc(content, df.schema());
-          Column partitionColumn = df.col("data_file.partition");
-          Dataset<Row> transformedDF = repartitionAndSort(df, partitionColumn, numManifests);
+          Column partitionColumn = df.col(CUSTOM_CLUSTERING_COLUMN_NAME);
+          Dataset<Row> transformedDF =
+              repartitionAndSort(df, partitionColumn, numManifests)
+                  .drop(CUSTOM_CLUSTERING_COLUMN_NAME);
           return writeFunc.apply(transformedDF).collectAsList();
         });
   }
@@ -544,4 +629,27 @@ private Table table() {
       return tableBroadcast.value();
     }
   }
+
+  /**
+   * Spark UDF that executes the supplied custom manifest clustering function.
+   *
+   * <p>Each call wraps the incoming {@code data_file} row as a {@link DataFile} through a {@link
+   * SparkDataFile} and returns the String produced by the clustering function.
+   */
+  static class CustomDataFileSorterUdf implements UDF1<Row, String>, Serializable {
+    // Iceberg and Spark struct types that tell SparkDataFile how to read a DataFile from a raw Row
+    private final Types.StructType dataFileType;
+    private final StructType dataFileSparkType;
+    private final Function<DataFile, String> clusteringFunction;
+
+    /**
+     * @param clusteringFunction serializable function mapping a data file to its clustering key
+     * @param dataFileType Iceberg struct type of the data file
+     * @param dataFileSparkType Spark struct type of the rows passed to {@link #call(Row)}
+     * @throws IllegalArgumentException if the function is null or not serializable
+     */
+    CustomDataFileSorterUdf(
+        Function<DataFile, String> clusteringFunction,
+        Types.StructType dataFileType,
+        StructType dataFileSparkType) {
+      // This class is Serializable, so the function it holds must be too. A cast cannot make it
+      // so; check explicitly and fail fast with a clear message.
+      Preconditions.checkArgument(
+          clusteringFunction instanceof Serializable,
+          "Invalid clustering function: must be non-null and implement java.io.Serializable");
+
+      this.clusteringFunction = clusteringFunction;
+      this.dataFileType = dataFileType;
+      this.dataFileSparkType = dataFileSparkType;
+    }
+
+    @Override
+    public String call(Row dataFile) throws Exception {
+      SparkDataFile wrapper = new SparkDataFile(dataFileType, dataFileSparkType);
+      return clusteringFunction.apply(wrapper.wrap(dataFile));
+    }
+  }
 }