1. PySpark Installation
$ pip install pyspark==2.3.3
# Download the jar that matches your Kudu version from https://mvnrepository.com/artifact/org.apache.kudu/kudu-spark2 and save it locally
$ wget https://repo1.maven.org/maven2/org/apache/kudu/kudu-spark2_2.11/1.10.0/kudu-spark2_2.11-1.10.0.jar
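To confirm the package is importable, you can print the installed version (a quick check, assuming the same Python environment pip installed into):
$ python -c "import pyspark; print(pyspark.__version__)"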
2. Read & Write with Spark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("KuduExample").getOrCreate()
## read from kudu ("${kudu_master}" and "${kudu_table}" are placeholders for your Kudu master addresses and table name)
df = spark.read.format("org.apache.kudu.spark.kudu") \
    .option("kudu.master", "${kudu_master}") \
    .option("kudu.table", "${kudu_table}") \
    .load()
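## optional sanity check on the loaded DataFrame
df.printSchema()
df.show(5)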
## write to kudu
sourceDf.write.format('org.apache.kudu.spark.kudu') \
.option('kudu.master', "${kudu_master}") \
.option('kudu.table', "${kudu_table}") \
.mode("Append") \
.save()
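For reference, sourceDf above stands for whatever DataFrame you want to persist. A minimal sketch (the columns and rows here are hypothetical and must match the schema of the target Kudu table):
sourceDf = spark.createDataFrame(
    [(1, "alice"), (2, "bob")],  # hypothetical rows
    ["id", "name"]               # hypothetical columns; must match the Kudu table schema
)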
3. Execution
# Local
$SPARK_HOME/bin/spark-submit --master local --jars ./libs/kudu-spark2_2.11-1.10.0.jar ./app/kudu_ex.py
# On YARN
$SPARK_HOME/bin/spark-submit --master yarn --deploy-mode cluster --jars ./libs/kudu-spark2_2.11-1.10.0.jar ./app/kudu_ex.py
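As an alternative to downloading the jar by hand, spark-submit can resolve the same artifact from Maven Central at submit time with --packages (a sketch, assuming the submitting host has network access):
$SPARK_HOME/bin/spark-submit --master yarn --deploy-mode cluster --packages org.apache.kudu:kudu-spark2_2.11:1.10.0 ./app/kudu_ex.py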