Update plugin to be compatible with embulk 0.11.5 #60

Open · wants to merge 4 commits into master

build.gradle (42 additions, 21 deletions)
@@ -2,56 +2,55 @@ plugins {
id "scala"
id "maven-publish"
id "com.diffplug.gradle.spotless" version "3.27.1"
id "org.embulk.embulk-plugins" version "0.4.1"
id "org.embulk.embulk-plugins" version "0.7.0"
}

repositories {
mavenCentral()
jcenter()
}

group = "pro.civitaspo"
version = "0.5.3"
description = "Dumps records to S3 Parquet."
version = "0.6.0"

sourceCompatibility = 1.8
targetCompatibility = 1.8
sourceCompatibility = 17
targetCompatibility = 17

dependencies {
compileOnly "org.embulk:embulk-core:0.9.23"
compileOnly "org.embulk:embulk-core:0.11.5"
compileOnly 'org.msgpack:msgpack-core:0.8.16'
compileOnly "org.embulk:embulk-spi:0.11"
compileOnly "org.embulk:embulk-api:0.10.43"
// NOTE: Is shadow plugin required in the future?
compile "org.scala-lang:scala-library:2.13.1"

implementation "org.scala-lang:scala-library:2.13.1"
implementation 'com.google.guava:guava:30.1.1-jre'
['glue', 's3', 'sts'].each { v ->
compile("com.amazonaws:aws-java-sdk-${v}:1.11.769") {
implementation("com.amazonaws:aws-java-sdk-${v}:1.11.769") {
exclude group: 'joda-time', module: 'joda-time'
exclude group: 'com.fasterxml.jackson.core', module: 'jackson-annotations'
exclude group: 'com.fasterxml.jackson.core', module: 'jackson-databind'
exclude group: 'com.fasterxml.jackson.core', module: 'jackson-core'
}
}
// Add Jackson dependencies explicitly
implementation 'com.fasterxml.jackson.core:jackson-core:2.14.0'
implementation 'com.fasterxml.jackson.core:jackson-databind:2.14.0'
implementation 'com.fasterxml.jackson.core:jackson-annotations:2.14.0'
['column', 'common', 'encoding', 'hadoop', 'jackson'].each { v ->
compile("org.apache.parquet:parquet-${v}:1.11.0") {
implementation("org.apache.parquet:parquet-${v}:1.11.0") {
exclude group: 'org.slf4j', module: 'slf4j-api'
}
}
// ref. https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.0/pom.xml#L85
compile('org.apache.parquet:parquet-format:2.7.0') {
implementation('org.apache.parquet:parquet-format:2.7.0') {
exclude group: 'org.slf4j', module: 'slf4j-api'
}
compile('org.apache.hadoop:hadoop-common:2.10.2') {
implementation('org.apache.hadoop:hadoop-common:2.10.2') {
exclude group: 'org.slf4j', module: 'slf4j-api'
exclude group: 'com.google.guava', module: 'guava'
exclude group: 'commons-beanutils', module: 'commons-beanutils-core'
exclude group: 'org.apache.commons', module: 'commons-lang3'
exclude group: 'com.fasterxml.jackson.core', module: 'jackson-core'
}
compile 'org.xerial.snappy:snappy-java:1.1.7.3'
implementation 'org.xerial.snappy:snappy-java:1.1.7.3'

['core', 'standards', 'deps-buffer', 'deps-config'].each { v ->
testImplementation "org.embulk:embulk-${v}:0.9.23"
}
testImplementation "org.embulk:embulk-core:0.9.23:tests"
testImplementation "org.embulk:embulk-core:0.11.5"
testImplementation "org.scalatest:scalatest_2.13:3.1.1"
testImplementation 'org.apache.parquet:parquet-avro:1.11.0'
testImplementation 'com.fasterxml.jackson.dataformat:jackson-dataformat-avro:2.14.0'
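
Worth noting for reviewers: the hunk above migrates every deprecated compile configuration to implementation (compile was removed in Gradle 7), drops the defunct jcenter() repository, and handles Jackson with an exclude-then-pin pattern, excluding the AWS SDK's older transitive Jackson modules and declaring the 2.14.0 versions explicitly. A single resolution rule would achieve the same pinning more compactly; a sketch of that alternative (not what the PR does):

    // Sketch (not in the PR): pin Jackson once instead of excluding it per dependency.
    configurations.all {
        resolutionStrategy {
            force 'com.fasterxml.jackson.core:jackson-core:2.14.0'
            force 'com.fasterxml.jackson.core:jackson-databind:2.14.0'
            force 'com.fasterxml.jackson.core:jackson-annotations:2.14.0'
        }
    }

Either approach leaves exactly one Jackson version on the runtime classpath.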
@@ -101,3 +100,25 @@ task scalatest(dependsOn: ['testClasses'], type: JavaExec) {
     args = ['-R', 'build/classes/scala/test', '-o']
     classpath = sourceSets.test.runtimeClasspath
 }
+
+tasks.withType(JavaExec) {
+    jvmArgs += [
+        '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED',
+        '--add-opens', 'java.base/java.io=ALL-UNNAMED',
+        '--add-opens', 'java.base/java.nio=ALL-UNNAMED',
+        '--add-opens', 'java.base/java.util=ALL-UNNAMED',
+        '--add-opens', 'java.base/java.lang=ALL-UNNAMED',
+        '--add-opens', 'java.base/java.lang.reflect=ALL-UNNAMED'
+    ]
+}
+
+tasks.withType(Test) {
+    jvmArgs += [
+        '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED',
+        '--add-opens', 'java.base/java.io=ALL-UNNAMED',
+        '--add-opens', 'java.base/java.nio=ALL-UNNAMED',
+        '--add-opens', 'java.base/java.util=ALL-UNNAMED',
+        '--add-opens', 'java.base/java.lang=ALL-UNNAMED',
+        '--add-opens', 'java.base/java.lang.reflect=ALL-UNNAMED'
+    ]
+}
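
The two new jvmArgs blocks open JDK internals (sun.nio.ch, java.io, java.nio, java.util, java.lang, java.lang.reflect) that Hadoop and Parquet still reach into via reflection and that Java 17 encapsulates by default; without them, the scalatest JavaExec task and any Test task fail with InaccessibleObjectException-style errors. Since both task types fork a JVM, the duplicated blocks could be collapsed; a sketch of that consolidation (an assumption, not part of this PR; parquetAddOpens is an illustrative name):

    // Sketch: share one --add-opens list between JavaExec and Test tasks.
    def parquetAddOpens = [
        'java.base/sun.nio.ch', 'java.base/java.io', 'java.base/java.nio',
        'java.base/java.util', 'java.base/java.lang', 'java.base/java.lang.reflect'
    ]
    [JavaExec, Test].each { taskType ->
        tasks.withType(taskType).configureEach {
            parquetAddOpens.each { mod -> jvmArgs += ['--add-opens', "${mod}=ALL-UNNAMED"] }
        }
    }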
gradle/dependency-locks/embulkPluginRuntime.lockfile (0 additions, 91 deletions)

This file was deleted.
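
This lockfile pinned the plugin's runtime dependency graph under the old embulk-plugins Gradle plugin; the PR deletes it rather than regenerating it, so runtime dependencies are no longer locked. If locking is still wanted after the Gradle 7.6 upgrade, it can be re-enabled with the stock Gradle mechanism (a sketch, not in the PR) and the lockfiles regenerated via ./gradlew dependencies --write-locks:

    // Sketch (assumption): restore dependency locking under Gradle 7.x.
    dependencyLocking {
        lockAllConfigurations()
    }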

gradle/wrapper/gradle-wrapper.properties (1 addition, 1 deletion)
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-6.3-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-7.6-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
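
Gradle 6.3 cannot run on Java 17 (Java 17 support arrived in Gradle 7.3), so this wrapper bump is a prerequisite for the sourceCompatibility = 17 change above rather than a cosmetic update. Such a change is normally produced with ./gradlew wrapper --gradle-version 7.6; pinning the version in build.gradle (a sketch, not part of this PR) keeps later regenerations consistent:

    // Sketch: pin the wrapper version so regeneration reproduces this file.
    wrapper {
        gradleVersion = '7.6'
        distributionType = Wrapper.DistributionType.BIN
    }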

ParquetColumnType.scala
@@ -23,7 +23,6 @@ import org.embulk.output.s3_parquet.catalog.GlueDataType
 import org.embulk.output.s3_parquet.implicits
 import org.embulk.spi.{Column, DataException, Exec}
 import org.embulk.spi.time.{Timestamp, TimestampFormatter}
-import org.embulk.spi.time.TimestampFormatter.TimestampColumnOption
 import org.msgpack.value.Value
 import org.slf4j.{Logger, LoggerFactory}

@@ -37,7 +36,7 @@ object ParquetColumnType {
   private val logger: Logger =
     LoggerFactory.getLogger(classOf[ParquetColumnType])
 
-  trait Task extends EmbulkTask with TimestampColumnOption {
+  trait Task extends EmbulkTask {
     @Config("logical_type")
     @ConfigDefault("null")
     def getLogicalType: Optional[LogicalTypeOption]
@@ -242,7 +241,7 @@ object ParquetColumnType {
           isSigned = o.getIsSigned,
           isAdjustedToUtc = o.getIsAdjustedToUtc,
           timeUnit = o.getTimeUnit,
-          timeZone = task.getTimeZoneId.map(ZoneId.of)
+          timeZone = Some(ZoneId.of("UTC"))
         )
       }
     }

ParquetFileWriteSupport.scala
@@ -37,7 +37,7 @@ object ParquetFileWriteSupport {
     def getColumnOptions: JMap[String, ParquetColumnType.Task]
     def setColumnOptions(
         columnOptions: JMap[String, ParquetColumnType.Task]
-    ): Unit
+    ): Unit
 
     @Config("type_options")
     @ConfigDefault("{}")
@@ -52,7 +52,7 @@ object ParquetFileWriteSupport {
     override def self(): WriterBuilder = this
     override def getWriteSupport(
         conf: Configuration
-    ): WriteSupport[PageReader] = writeSupport
+    ): WriteSupport[PageReader] = writeSupport
   }
 
   def configure(task: Task): Unit = {
@@ -72,22 +72,7 @@ object ParquetFileWriteSupport {
     task.getTypeOptions.keys.foreach(
       embulkType
-    ) // throw ConfigException if unknown type name is found.
-
-    task.getColumnOptions.foreach {
-      case (c: String, t: ParquetColumnType.Task) =>
-        val column: Column = schema.lookupColumn(c) // throw ConfigException if columnName does not exist.
-
-        if (t.getFormat.isDefined || t.getTimeZoneId.isDefined) {
-          if (!column.getType.isInstanceOf[TimestampType]) {
-            // NOTE: Warning is better instead of throwing.
-            throw new ConfigException(
-              s"The type of column{name:${column.getName},type:${column.getType.getName}} is not 'timestamp'," +
-                " but timestamp options (\"format\" or \"timezone\") are set."
-            )
-          }
-        }
-    }
+    ) // throw ConfigException if unknown type name is found.
   }
 
   private def embulkType(typeName: String): Type = {
@@ -104,19 +89,28 @@ object ParquetFileWriteSupport {
throw new ConfigException(s"Unknown embulk type: $typeName.")
}

private def createTimestampFormatters(task: Task, schema: Schema): Seq[TimestampFormatter] = {
(0 until schema.getColumnCount).map { i =>
val column = schema.getColumn(i)
if (column.getType.isInstanceOf[TimestampType]) {
TimestampFormatter.of(task.getDefaultTimestampFormat(), "UTC")
} else
null
}
}

def apply(task: Task, schema: Schema): ParquetFileWriteSupport = {
validateTask(task, schema)

val parquetSchema: Map[Column, ParquetColumnType] = schema.getColumns.map {
c: Column =>
c -> task.getColumnOptions.toMap
.get(c.getName)
.orElse(task.getTypeOptions.toMap.get(c.getType.getName))
.flatMap(ParquetColumnType.fromTask)
.getOrElse(DefaultColumnType)
c -> task.getColumnOptions.toMap
.get(c.getName)
.orElse(task.getTypeOptions.toMap.get(c.getType.getName))
.flatMap(ParquetColumnType.fromTask)
.getOrElse(DefaultColumnType)
}.toMap
val timestampFormatters: Seq[TimestampFormatter] = Timestamps
.newTimestampColumnFormatters(task, schema, task.getColumnOptions)
val timestampFormatters: Seq[TimestampFormatter] = createTimestampFormatters(task, schema)
new ParquetFileWriteSupport(schema, parquetSchema, timestampFormatters)
}
}
@@ -157,7 +151,7 @@ case class ParquetFileWriteSupport private (
     schema.visitColumns(new ColumnVisitor {
       override def booleanColumn(column: Column): Unit = nullOr(column) {
         parquetSchema(column)
-          .consumeBoolean(current, record.getBoolean(column))
+          .consumeBoolean(current, record.getBoolean(column))
       }
       override def longColumn(column: Column): Unit = nullOr(column) {
         parquetSchema(column).consumeLong(current, record.getLong(column))
@@ -196,6 +190,6 @@ case class ParquetFileWriteSupport private (
     current.endField(column.getName, column.getIndex)
   }
 
-  def newWriterBuilder(pathString: String): WriterBuilder =
-    WriterBuilder(new Path(pathString), this)
+  def newWriterBuilder(pathString: String): ParquetFileWriteSupport.WriterBuilder =
+    ParquetFileWriteSupport.WriterBuilder(new Path(pathString), this)
 }
}