Configuring Apache Spark for Apache Iceberg

Apache Iceberg

spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:1.0.0\
--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
--conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \
--conf spark.sql.catalog.spark_catalog.type=hive \
--conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.local.type=hadoop \
--conf spark.sql.catalog.local.warehouse=$PWD/warehouse

Spark Configurations

Flags

--packages org.projectnessie:nessie-spark-3.2-extensions:0.43.0,org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.14.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178
--jars /home/docker/iceberg-spark-runtime-3.2_2.12-1.0.0.jar,/home/docker/nessie-spark-extensions-3.2_2.12-0.44.0.jar
--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
--conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \

Function Calls

val conf: SparkConf = new SparkConf()
conf.setAppName("yo")
conf.set("spark.jars.packages", "org.projectnessie:nessie-spark-3.2-extensions:0.43.0,org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.14.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178")
conf.set(spark.sql.catalog.spark_catalog","org.apache.iceberg.spark.SparkSessionCatalog")
conf = SparkConf()
conf.set(
"spark.jars.packages",
"org.projectnessie:nessie-spark-3.2-extensions:0.43.0,org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.14.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178",
)
conf.set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")

Packages

Configurations

Amazon S3 Credentials

--conf spark.hadoop.fs.s3a.access.key=$AWS_ACCESS_KEY \
--conf spark.hadoop.fs.s3a.secret.key=$AWS_SECRET_ACCESS_KEY

Environmental Variables

export TOKEN=XXXXXX
export WAREHOUSE=s3a://.../
export AWS_ACCESS_KEY_ID=XXXXXX
export AWS_SECRET_ACCESS_KEY=XXXXXX
export URI=https://...
export AWS_REGION=us-east-1
export DB_USERNAME=xxxxxx
export DB_PASSWORD=xxxxx

Sample Configurations

docker run -it -p 8080:8080 --name spark-playground alexmerced/spark33playground
spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.0.0\
--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
--conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \
--conf spark.sql.catalog.spark_catalog.type=hive \
--conf spark.sql.catalog.hive=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.hive.type=hadoop \
--conf spark.sql.catalog.hive.uri=$URI \
--conf spark.sql.catalog.hive.warehouse=$WAREHOUSE
spark-sql --packages "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.0.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178" \
--conf spark.sql.catalog.glue=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.glue.warehouse=$WAREHOUSE \
--conf spark.sql.catalog.glue.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog \
--conf spark.sql.catalog.glue.io-impl=org.apache.iceberg.aws.s3.S3FileIO \
--conf spark.hadoop.fs.s3a.access.key=$AWS_ACCESS_KEY \
--conf spark.hadoop.fs.s3a.secret.key=$AWS_SECRET_ACCESS_KEY
spark-sql --packages "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.0.0,org.projectnessie:nessie-spark-extensions-3.3_2.12:0.44.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178" \
--conf spark.sql.extensions="org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions" \
--conf spark.sql.catalog.nessie.uri=$ARCTIC_URI \
--conf spark.sql.catalog.nessie.ref=main \
--conf spark.sql.catalog.nessie.authentication.type=BEARER \
--conf spark.sql.catalog.nessie.authentication.token=$TOKEN \
--conf spark.sql.catalog.nessie.catalog-impl=org.apache.iceberg.nessie.NessieCatalog \
--conf spark.sql.catalog.nessie.warehouse=$WAREHOUSE \
--conf spark.sql.catalog.nessie=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.nessie.io-impl=org.apache.iceberg.aws.s3.S3FileIO
spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:1.0.0 \
--conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.jdbc.warehouse=$WAREHOUSE \
--conf spark.sql.catalog.jdbc.catalog-impl=org.apache.iceberg.jdbc.JdbcCatalog \
--conf spark.sql.catalog.jdbc.uri=$URI \
--conf spark.sql.catalog.jdbc.jdbc.verifyServerCertificate=true \
--conf spark.sql.catalog.jdbc.jdbc.useSSL=true \
--conf spark.sql.catalog.jdbc.jdbc.user=$DB_USERNAME \
--conf spark.sql.catalog.jdbc.jdbc.password=$DB_PASSWORD

Conclusion

--conf spark.sql.catalog.mycoolcatalog.catalog-impl=org.apache.iceberg.jdbc.JdbcCatalog \
CREATE TABLE mycoolcatalog.table1 (name STRING) USING iceberg;

--

--

Alex Merced is a Developer Advocate for Dremio and host of the Web Dev 101 and Datanation Podcasts.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Alex Merced Coder

Alex Merced is a Developer Advocate for Dremio and host of the Web Dev 101 and Datanation Podcasts.