70 changes: 70 additions & 0 deletions apache-spark-2/pom.xml
@@ -0,0 +1,70 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <artifactId>apache-spark-2</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>apache-spark-2</name>

    <parent>
        <groupId>com.baeldung</groupId>
        <artifactId>parent-modules</artifactId>
        <version>1.0.0-SNAPSHOT</version>
    </parent>

    <dependencies>
        <dependency>
            <groupId>io.delta</groupId>
            <artifactId>delta-core_2.12</artifactId>
            <version>${delta-core.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>${org.apache.spark.spark-core.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>${org.apache.spark.spark-sql.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>${maven-assembly-plugin.version}</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <repositories>
        <repository>
            <id>SparkPackagesRepo</id>
            <url>https://repos.spark-packages.org</url>
        </repository>
    </repositories>

    <properties>
        <delta-core.version>2.4.0</delta-core.version>
        <org.apache.spark.spark-core.version>3.4.0</org.apache.spark.spark-core.version>
        <org.apache.spark.spark-sql.version>3.4.0</org.apache.spark.spark-sql.version>
        <maven-assembly-plugin.version>3.3.0</maven-assembly-plugin.version>
    </properties>
</project>
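
Note on version alignment: the _2.12 suffix on delta-core and the Spark artifacts pins the Scala binary version, and Delta Lake 2.4.x is built against Spark 3.4.x, so these properties need to move together. A minimal sketch for sanity-checking what is actually on the classpath (the SessionCheck class is illustrative, not part of this change):

package com.baeldung.delta;

import org.apache.spark.sql.SparkSession;

// Hypothetical helper: prints the runtime Spark version so a mismatch with
// delta-core_2.12 (compiled for Spark 3.4.x) surfaces before any query runs.
public class SessionCheck {
    public static void main(String[] args) {
        SparkSession spark = DeltaLake.createSession();
        System.out.println("Running Spark " + spark.version());
        DeltaLake.stopSession(spark);
    }
}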
60 changes: 60 additions & 0 deletions apache-spark-2/src/main/java/com/baeldung/delta/DeltaLake.java
@@ -0,0 +1,60 @@
package com.baeldung.delta;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import java.io.Serializable;
import java.nio.file.Files;
import java.util.Arrays;

public class DeltaLake {

    // Builds a local session with the Delta Lake SQL extension and catalog wired in;
    // both configs are required for the DELTA source and DESCRIBE DETAIL to work.
    public static SparkSession createSession() {
        return SparkSession.builder()
            .appName("DeltaLake")
            .master("local[*]")
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            .getOrCreate();
    }

    // Writes two seed rows to a fresh temp directory in Delta format and registers
    // it in the catalog as the external table 'people'; returns the table path.
    public static String preparePeopleTable(SparkSession spark) {
        try {
            String tablePath = Files.createTempDirectory("delta-table-").toAbsolutePath().toString();

            Dataset<Row> data = spark.createDataFrame(
                Arrays.asList(
                    new Person(1, "Alice"),
                    new Person(2, "Bob")
                ),
                Person.class
            );

            data.write().format("delta").mode("overwrite").save(tablePath);
            spark.sql("DROP TABLE IF EXISTS people");
            spark.sql("CREATE TABLE IF NOT EXISTS people USING DELTA LOCATION '" + tablePath + "'");
            return tablePath;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    public static void cleanupPeopleTable(SparkSession spark) {
        spark.sql("DROP TABLE IF EXISTS people");
    }

    public static void stopSession(SparkSession spark) {
        if (spark != null) {
            spark.stop();
        }
    }

    // JavaBean used by createDataFrame; Spark needs the public no-arg
    // constructor and getters/setters to infer the schema.
    public static class Person implements Serializable {
        private int id;
        private String name;

        public Person() {}
        public Person(int id, String name) { this.id = id; this.name = name; }

        public int getId() { return id; }
        public void setId(int id) { this.id = id; }
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
    }
}
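
Because preparePeopleTable both writes the Delta files and registers the external table, the data can be read back through the catalog or directly from the path, and earlier versions stay queryable via Delta's time travel. A short sketch of both (the Carol row is illustrative, not part of this change):

// Read the current snapshot directly from the Delta path.
Dataset<Row> current = spark.read().format("delta").load(tablePath);

// Append one more row; Delta records this as a new table version.
spark.createDataFrame(
        java.util.Arrays.asList(new DeltaLake.Person(3, "Carol")),
        DeltaLake.Person.class
).write().format("delta").mode("append").save(tablePath);

// Time travel: version 0 still contains only the two seed rows.
Dataset<Row> v0 = spark.read().format("delta").option("versionAsOf", 0).load(tablePath);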
43 changes: 43 additions & 0 deletions apache-spark-2/src/test/java/com/baeldung/delta/DeltaLakeUnitTest.java
@@ -0,0 +1,43 @@
package com.baeldung.delta;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

public class DeltaLakeUnitTest {

    private static SparkSession spark;
    private static String tablePath;

    @BeforeAll
    static void setUp() {
        spark = DeltaLake.createSession();
        tablePath = DeltaLake.preparePeopleTable(spark);
    }

    @AfterAll
    static void tearDown() {
        try {
            DeltaLake.cleanupPeopleTable(spark);
        } finally {
            DeltaLake.stopSession(spark);
        }
    }

    @Test
    void givenDeltaLake_whenUsingDeltaFormat_thenPrintAndValidate() {
        Dataset<Row> df = spark.sql("DESCRIBE DETAIL people");
        df.show(false);

        Row row = df.first();
        assertEquals("file:" + tablePath, row.getAs("location"));
        assertEquals("delta", row.getAs("format"));
        assertTrue(row.<Long>getAs("numFiles") >= 1);
    }
}
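
The DESCRIBE DETAIL assertions only cover table metadata; a natural follow-up would assert on the rows themselves. A sketch of such a test (not part of this change), relying on the two rows seeded by preparePeopleTable:

@Test
void givenPeopleTable_whenCountingRows_thenMatchesSeedData() {
    long count = spark.sql("SELECT COUNT(*) FROM people").first().getLong(0);
    assertEquals(2L, count);
}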