forked from danielbeach/data-engineering-practice
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsplit
63 lines (48 loc) · 1.56 KB
/
split
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
/** Demo: derives a prefix column "C" and a remainder column "D" from the text in column "A"
  * using `when`/`otherwise` together with `split` and `substring_index`.
  *
  * Rows whose text contains the literal "USER Apple":
  *   - "C" is the literal "USER Apple";
  *   - "D" is whatever follows the last occurrence of "USER Apple ".
  *
  * All other rows:
  *   - "C" is the first space-separated token;
  *   - "D" is the last space-separated token.
  *
  * NOTE(review): this file also defines a second `object Main` (a regex-based variant);
  * the two definitions clash if compiled together, so keep only one or rename one.
  * NOTE(review): Spark's Quick Start guide recommends defining `main()` rather than
  * extending `scala.App`; `App` is kept here so the entry point stays unchanged.
  */
object Main extends App {
  // Create SparkSession
  val spark = SparkSession.builder()
    .appName("DataFrameManipulation")
    .master("local[*]")
    .getOrCreate()

  // Dummy data. These are plain Strings: writing ("...") does NOT build a Tuple1.
  val data = Seq(
    "USER Apple is great",
    "USER Banana is tasty",
    "Orange is healthy"
  )

  // Create DataFrame.
  // createDataFrame(Seq[A]) requires A <: Product, which String is not, so each value is
  // wrapped in a Tuple1 to obtain a single-column DataFrame, then the column is named "A".
  val df = spark.createDataFrame(data.map(Tuple1(_))).toDF("A")

  // DataFrame Transformation
  val dfWithSplit = df
    .withColumn(
      "C",
      when(col("A").contains("USER Apple"), "USER Apple")
        .otherwise(split(col("A"), " ").getItem(0))
    )
    .withColumn(
      "D",
      when(col("A").contains("USER Apple"), substring_index(col("A"), "USER Apple ", -1))
        .otherwise(substring_index(col("A"), " ", -1))
    )

  // Show result, and always stop the session even if the action fails.
  try dfWithSplit.show()
  finally spark.stop()
}
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
/** Demo: derives columns "C" and "D" from the text in column "A" using regular expressions.
  *
  *   - "C" is capture group 1 of `USER\s+(\w+)`, i.e. the word following "USER".
  *     Per Spark's `regexp_extract` documentation, rows that do not match yield an
  *     empty string.
  *   - "D" is column "A" with every match of `USER\s+\w+\s*` removed; rows without a
  *     match are left as they are.
  *
  * NOTE(review): this file also defines another `object Main` (a when/split-based variant);
  * the two definitions clash if compiled together, so keep only one or rename one.
  * NOTE(review): Spark's Quick Start guide recommends defining `main()` rather than
  * extending `scala.App`; `App` is kept here so the entry point stays unchanged.
  */
object Main extends App {
  // Create SparkSession
  val spark = SparkSession.builder()
    .appName("DataFrameManipulation")
    .master("local[*]")
    .getOrCreate()

  // Brings the implicit conversions into scope that add `toDF` to local Scala collections;
  // without this import `data.toDF("A")` does not compile.
  import spark.implicits._

  // Dummy data. These are plain Strings: writing ("...") does NOT build a Tuple1.
  val data = Seq(
    "USER Apple is great",
    "USER Banana is tasty",
    "Orange is healthy"
  )

  // Create DataFrame with a single string column named "A".
  val df = data.toDF("A")

  // DataFrame Transformation
  val dfWithRegex = df
    .withColumn("C", regexp_extract(col("A"), "USER\\s+(\\w+)", 1))
    .withColumn("D", regexp_replace(col("A"), "USER\\s+\\w+\\s*", ""))

  // Show result, and always stop the session even if the action fails.
  try dfWithRegex.show()
  finally spark.stop()
}