Init the repository
OndrejKucera authored and Ondrej Kucera committed Feb 28, 2018
1 parent 7087dd8 commit 1c3b7b6
Showing 35 changed files with 158,813 additions and 2 deletions.
8 changes: 7 additions & 1 deletion .gitignore
@@ -14,9 +14,15 @@
*.jar
*.war
*.ear
*.zip
*.tar.gz
*.rar

# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*

# other
.idea/
metastore_db/
project/
target/
*.iml
66 changes: 66 additions & 0 deletions Dockerfile
@@ -0,0 +1,66 @@
## Sources - https://github.com/P7h/docker-spark

FROM openjdk:8
MAINTAINER Ondrej Kucera <[email protected]>

# Scala related variables.
ARG SCALA_VERSION=2.12.2
ARG SCALA_BINARY_ARCHIVE_NAME=scala-${SCALA_VERSION}
ARG SCALA_BINARY_DOWNLOAD_URL=http://downloads.lightbend.com/scala/${SCALA_VERSION}/${SCALA_BINARY_ARCHIVE_NAME}.tgz

# SBT related variables.
ARG SBT_VERSION=0.13.15
ARG SBT_BINARY_ARCHIVE_NAME=sbt-${SBT_VERSION}
ARG SBT_BINARY_DOWNLOAD_URL=https://dl.bintray.com/sbt/native-packages/sbt/${SBT_VERSION}/${SBT_BINARY_ARCHIVE_NAME}.tgz

# Maven related variables
ARG MVN_VERSION=3.5.2
ARG MVN_BINARY_ARCHIVE_NAME=apache-maven-${MVN_VERSION}
ARG MVN_BINARY_DOWNLOAD_URL=http://www-us.apache.org/dist/maven/maven-3/${MVN_VERSION}/binaries/${MVN_BINARY_ARCHIVE_NAME}-bin.tar.gz

# Spark related variables.
ARG SPARK_VERSION=2.2.0
ARG SPARK_BINARY_ARCHIVE_NAME=spark-${SPARK_VERSION}-bin-hadoop2.7
ARG SPARK_BINARY_DOWNLOAD_URL=http://d3kbcqa49mib13.cloudfront.net/${SPARK_BINARY_ARCHIVE_NAME}.tgz

# Configure env variables for Scala, SBT and Spark.
# Also configure PATH env variable to include binary folders of Java, Scala, SBT and Spark.
ENV SCALA_HOME /usr/local/scala
ENV SBT_HOME /usr/local/sbt
ENV MVN_HOME /usr/local/mvn
ENV SPARK_HOME /usr/local/spark
ENV PATH $JAVA_HOME/bin:$SCALA_HOME/bin:$SBT_HOME/bin:$MVN_HOME/bin:$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH

# Download, uncompress and move all the required packages and libraries to their corresponding directories in /usr/local/ folder.
RUN apt-get -yqq update && \
apt-get install -yqq vim less screen tmux && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
rm -rf /tmp/* && \
wget -qO - ${SCALA_BINARY_DOWNLOAD_URL} | tar -xz -C /usr/local/ && \
wget -qO - ${SBT_BINARY_DOWNLOAD_URL} | tar -xz -C /usr/local/ && \
wget -qO - ${MVN_BINARY_DOWNLOAD_URL} | tar -xz -C /usr/local/ && \
wget -qO - ${SPARK_BINARY_DOWNLOAD_URL} | tar -xz -C /usr/local/ && \
cd /usr/local/ && \
ln -s ${SCALA_BINARY_ARCHIVE_NAME} scala && \
ln -s ${MVN_BINARY_ARCHIVE_NAME} mvn && \
ln -s ${SPARK_BINARY_ARCHIVE_NAME} spark && \
cp spark/conf/log4j.properties.template spark/conf/log4j.properties && \
sed -i -e s/WARN/ERROR/g spark/conf/log4j.properties && \
sed -i -e s/INFO/ERROR/g spark/conf/log4j.properties

# We will be running our Spark jobs as `root` user.
USER root

# Working directory is set to a project folder under the `root` user's home.
ARG MAIN_DIR=workshop-spark
RUN mkdir /root/${MAIN_DIR}
WORKDIR /root/${MAIN_DIR}

# Expose ports for monitoring.
# SparkContext web UI on 4040 -- only available for the duration of the application.
# Spark master’s web UI on 8080.
# Spark worker web UI on 8081.
EXPOSE 4040 8080 8081

CMD ["/bin/bash"]
50 changes: 49 additions & 1 deletion README.md
@@ -1 +1,49 @@
# workshop-spark
# Workshop: An Introduction to Apache Spark - 101
![](http://spark.apache.org/docs/latest/img/spark-logo-hd.png)

As you go through the workshop, you will learn what distributed computing is, how the MapReduce and Spark approaches differ, and the basics of Spark architecture. You will be able to start a [Spark](https://spark.apache.org/) job on a standalone cluster and work with the basic [Spark API](https://spark.apache.org/docs/latest/api/scala/index.html) - RDDs and Datasets/DataFrames. The workshop focuses only on the Spark SQL module.
> **NOTE** This workshop was initially created for the [DevFest 2017](https://2017.devfest.cz/speakers/42) in Prague.
___

## Set the environment
As a first step, you have to set up your [Spark environment](environment.md) to get everything working. This includes installing Docker and a description of how to run a Docker container where Apache Spark will be ready to use.
___

## Distributed computing
[Let's find out](distribution.md) why to choose a distributed computing approach and what it actually means.
___
## Differences between MapReduce and Spark
Why isn't MapReduce's approach good enough, and how does Spark differ? You can read about it [here](mapreduce.md).
___

## Spark’s Basic Architecture
In order to understand how to use Spark, it is good to understand the basics of [Spark architecture](architecture.md).
___

## Tasks

#### Task 0: The First Run of Spark
Get to know Spark and the Spark REPL, and run your first job (a short REPL sketch follows the links below).
* scala: [link](scala/task0-firstrun/README.md)
* java: [link](java/task0-firstrun/README.md)
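
A hedged taste of the first run, entered in `spark-shell` inside the container:

```scala
// Start the REPL with: spark-shell
// The shell provides `sc` (SparkContext) and `spark` (SparkSession) automatically.
val evens = sc.parallelize(1 to 1000).filter(_ % 2 == 0)
println(evens.count())  // 500, computed as your first Spark job
```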
___

#### Task 1: Word-count
You will write your first Spark application. Word-count is the "hello world" of distributed computation (an illustrative sketch follows the links below).
* scala: [link](scala/task1-wordcount/README.md)
* java: [link](java/task1-wordcount/README.md)
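
A sketch of the classic word-count in Scala; the object name and input path are placeholders, and the real skeleton lives in the linked task folders:

```scala
import org.apache.spark.sql.SparkSession

// Illustrative only; the actual task template is under scala/task1-wordcount.
object WordCount {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("word-count").getOrCreate()
    val sc = spark.sparkContext

    val counts = sc.textFile("input.txt")      // placeholder path
      .flatMap(_.split("\\s+"))                // split lines into words
      .filter(_.nonEmpty)
      .map(word => (word, 1))
      .reduceByKey(_ + _)                      // sum the counts per word

    counts.take(10).foreach(println)
    spark.stop()
  }
}
```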
___

#### Task 2: Analyzing Flight Delays
You will analyze real data with the help of RDDs and Datasets (an illustrative sketch follows the links below).
* scala: [link](scala/task2-flights/README.md)
* java: [link](java/task2-flights/README.md)
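
To give a flavour of the Dataset/DataFrame API used here, a sketch that computes the average departure delay per origin airport; the file path and column names (`origin`, `dep_delay`) are assumptions, not taken from the workshop data:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{avg, desc}

// Illustrative only; the path and column names are hypothetical.
object FlightDelays {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("flight-delays").getOrCreate()

    val flights = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("data/flights.csv")                     // placeholder path

    flights.groupBy("origin")
      .agg(avg("dep_delay").as("avg_dep_delay"))   // average delay per origin
      .orderBy(desc("avg_dep_delay"))
      .show(10)

    spark.stop()
  }
}
```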
___

#### Task 3: Run both Spark jobs in the cluster (optional)
You can submit and run all Spark jobs on a Spark standalone cluster in cluster deploy mode (see the example command below the links).
* scala: [link](scala/task3/README.md)
* java: [link](java/task3/README.md)
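
A sketch of the submit command; the master URL, class name, and jar path are placeholders that depend on how you build and name the jobs:

```bash
# Submit a job to the standalone master in cluster deploy mode
# (master hostname, class, and jar path below are placeholders).
spark-submit \
  --master spark://master:7077 \
  --deploy-mode cluster \
  --class WordCount \
  path/to/wordcount.jar
```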
___

Recommended further reading: [Spark: The Definitive Guide](http://shop.oreilly.com/product/0636920034957.do)
5 changes: 5 additions & 0 deletions architecture.md
@@ -0,0 +1,5 @@
## Spark architecture
???
___

#### 1 ...
4 changes: 4 additions & 0 deletions conf/history/spark-defaults.conf
@@ -0,0 +1,4 @@
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

spark.history.fs.logDirectory file:/tmp/spark-events
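
This file points the history server at the directory the jobs write their event logs to (see `spark.eventLog.dir` in the master/worker configs below). A sketch of starting it, assuming the repository's `conf/history` folder is used as the active Spark conf directory inside the container:

```bash
# Start the history server with conf/history as the active conf directory;
# it reads event logs from /tmp/spark-events and serves its UI on port 18080.
SPARK_CONF_DIR=/root/workshop-spark/conf/history $SPARK_HOME/sbin/start-history-server.sh
```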
15 changes: 15 additions & 0 deletions conf/master/spark-defaults.conf
@@ -0,0 +1,15 @@
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

spark.driver.port 7001
spark.fileserver.port 7002
spark.broadcast.port 7003
spark.replClassServer.port 7004
spark.blockManager.port 7005
spark.executor.port 7006

spark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory
spark.port.maxRetries 4

spark.eventLog.dir file:/tmp/spark-events
spark.eventLog.enabled true
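
A sketch of launching the standalone master with this configuration, assuming the repository's `conf/master` folder is available inside the container:

```bash
# Launch the standalone master with conf/master as the active conf directory.
# The master UI is served on port 8080; workers connect to spark://<host>:7077.
SPARK_CONF_DIR=/root/workshop-spark/conf/master $SPARK_HOME/sbin/start-master.sh
```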
15 changes: 15 additions & 0 deletions conf/worker/spark-defaults.conf
@@ -0,0 +1,15 @@
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

#spark.driver.port 7101
spark.fileserver.port 7012
spark.broadcast.port 7013
spark.replClassServer.port 7014
spark.blockManager.port 7015
spark.executor.port 7016

spark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory
spark.port.maxRetries 4

spark.eventLog.dir file:/tmp/spark-events
spark.eventLog.enabled true
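
And a matching sketch for a worker pointed at the master (the master hostname is a placeholder), assuming `conf/worker` is available inside the container:

```bash
# Launch a worker with conf/worker as the active conf directory; its UI is on port 8081.
# In Spark 2.x the worker launch script is named start-slave.sh.
SPARK_CONF_DIR=/root/workshop-spark/conf/worker $SPARK_HOME/sbin/start-slave.sh spark://master:7077
```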
