Install java for spark and R-datascience images (for packages like RJDemetra) (#181)

* moved java install into a separate .sh, added libbz2-dev for the jdk compiler

* fixed typos since jupyter-spark-r does not exist

* make use of JAVA_HOME for Renviron, added java test for spark images

* dummy changes for matrix to build from base

* dev

* debug

* no spaces

* debug

* debug more

* tweaked main-workflow to copy scripts into /opt for all images

* typo in README + debug mode

* debug more

* typo

* also use the install-java script for r-datascience

* forgot java version variables for r-datascience

* apt-get update in r-datascience before install-java.sh

* took out debug code

* added ENV in dockerfiles so vars are passed to containers

* move libbz2 into install-java.sh for
odysseu authored Feb 9, 2024
1 parent 0731bbc commit b563e68
Showing 8 changed files with 44 additions and 15 deletions.
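The commit's stated motivation is Java-dependent R packages such as RJDemetra. A quick hedged sanity check that rJava can find the freshly installed JDK inside an image (a sketch, assuming the rJava package is installed):

    # Initializes the JVM via rJava and prints the JDK version R actually sees;
    # a "17.x" result confirms JAVA_HOME and javareconf are wired correctly.
    Rscript -e 'library(rJava); .jinit(); cat(.jcall("java/lang/System", "S", "getProperty", "java.version"))'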
5 changes: 2 additions & 3 deletions README.md
@@ -21,7 +21,7 @@ They can be used alone but are designed to work with the [Onyxia](https://github
PM-->PYTENSORFLOW[python-tensorflow]:::package;
PM-->PYTORCH[python-pytorch]:::package;
RM-->RDS[r-datascience]:::package;
- RM-->RSPARK[r-sparkr]:::package;
+ RM-->RSPARK[sparkr]:::package;
RM-->RPYJU[r-python-julia]:::package;
PYSPARK--> JPYSPARK[jupyter-pyspark]:::ide;
PYDS--> JPYDS[jupyter-python]:::ide;
@@ -31,7 +31,6 @@ They can be used alone but are designed to work with the [Onyxia](https://github
PYDS--> VSCODEPYDS[vscode-python]:::ide;
PYTENSORFLOW--> VSCODEPYTENSORFLOW[vscode-tensorflow]:::ide;
PYTORCH--> VSCODEPYTORCH[vscode-pytorch]:::ide;
- RSPARK -->JSPARKR[jupyter-sparkr]:::ide;
RDS--> JRDS[jupyter-r]:::ide;
RSPARK -->RSTUDIOSPARKR[rstudio-sparkr]:::ide;
RDS--> RSTUDIORDS[rstudio-r]:::ide;
@@ -65,7 +64,7 @@ If Onyxia support is checked, it means that the Onyxia product could inject auto
There are multiple recipes:

**Your user has non root capabilities:**
- - use an init script : https://github.com/InseeFrLab/images-datascience/blob/main/base/common-scripts/onyxia-init.sh#L7
+ - use an init script : https://github.com/InseeFrLab/images-datascience/blob/main/scripts/onyxia-init.sh#L7
- you can use an init region script location injected by Onyxia, with a curl to an endpoint serving your certificates, and put them in a path, let's say /tmp/ca-certificates:
- put this path in the env variable PATH_TO_CABUNDLE; the onyxia-init.sh script will then configure git, pip and conda to use these certificates.

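The CA-bundle recipe above boils down to pointing each tool at the custom bundle. A minimal sketch of the idea, with illustrative paths (the actual logic lives in scripts/onyxia-init.sh):

    # Hypothetical bundle location, as in the recipe above
    export PATH_TO_CABUNDLE=/tmp/ca-certificates
    # Point git, pip and conda at the custom CA bundle
    git config --global http.sslCAInfo "$PATH_TO_CABUNDLE/ca-bundle.crt"
    pip config set global.cert "$PATH_TO_CABUNDLE/ca-bundle.crt"
    conda config --set ssl_verify "$PATH_TO_CABUNDLE/ca-bundle.crt"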
10 changes: 9 additions & 1 deletion r-datascience/Dockerfile
@@ -3,10 +3,18 @@ FROM $BASE_IMAGE

LABEL maintainer="InseeFrLab <[email protected]>"

+ ARG JAVA_VERSION="17"
+ ENV JAVA_VERSION=${JAVA_VERSION}
+ ENV JAVA_HOME="/usr/lib/jvm/java-$JAVA_VERSION-openjdk-amd64"
+ ENV PATH="${JAVA_HOME}/bin:${PATH}"

USER root

# Install additional libraries and R packages for datascience
- RUN /opt/install-quarto.sh && \
+ RUN apt-get update && \
+     # Install JDK
+     /opt/install-java.sh && \
+     /opt/install-quarto.sh && \
    # Install Shiny Server
    /rocker_scripts/install_shiny_server.sh && \
    # Install packages bundles from rocker
4 changes: 4 additions & 0 deletions r-datascience/tests.yaml
@@ -44,3 +44,7 @@ commandTests:
command: "which"
args: ["quarto"]
expectedOutput: ["/usr/local/bin/quarto"]
- name: "Does the binary exists?"
command: "which"
args: ["java"]
expectedOutput: ["/usr/lib/jvm/java-17-openjdk-amd64/bin/java"]
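These checks are in container-structure-test format; a hedged usage sketch (the image reference is illustrative, adjust it to your build):

    # Run the image's test suite locally against a built image
    container-structure-test test \
      --image inseefrlab/onyxia-r-datascience:latest \
      --config r-datascience/tests.yaml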
12 changes: 12 additions & 0 deletions scripts/install-java.sh
@@ -0,0 +1,12 @@
+ #!/bin/bash
+ set -e
+
+ apt-get install -y --no-install-recommends \
+     ca-certificates-java \
+     openjdk-${JAVA_VERSION}-jre-headless \
+     openjdk-${JAVA_VERSION}-jdk-headless \
+     libbz2-dev # for jdk
+
+ if command -v R; then
+     R CMD javareconf
+ fi
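Note that the script does not hard-code a version: it relies on JAVA_VERSION being exported by the calling Dockerfile (hence the ARG/ENV lines added above), and it expects an apt-get update to have run beforehand. A hedged invocation sketch outside the images:

    # As root, after apt-get update, as the Dockerfiles do
    JAVA_VERSION=17 bash /opt/install-java.sh
    java -version   # should report an OpenJDK 17 runtime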
1 change: 0 additions & 1 deletion scripts/install-spark-hadoop-hive.sh
@@ -4,7 +4,6 @@ set -e
HADOOP_VERSION="3.3.6"
HIVE_VERSION="2.3.9"
HIVE_LISTENER_VERSION="0.0.3"
- JAVA_VERSION="17"

export SPARK_BUILD_S3_BUCKET="https://minio.lab.sspcloud.fr/projet-onyxia/build"
export SPARK_BUILD_NAME="spark-${SPARK_VERSION}-bin-hadoop-${HADOOP_VERSION}-hive-${HIVE_VERSION}-java-${JAVA_VERSION}"
6 changes: 3 additions & 3 deletions scripts/onyxia-init.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

echo "start of onyxia-init.sh script en tant que :"
echo "start of onyxia-init.sh script as user :"
whoami

sudo true -nv 2>&1
@@ -141,8 +141,8 @@ if command -v R; then
echo -e "SPARK_HOME=$SPARK_HOME" >> ${R_HOME}/etc/Renviron.site
echo -e "HADOOP_HOME=$HADOOP_HOME" >> ${R_HOME}/etc/Renviron.site
echo -e "HADOOP_OPTIONAL_TOOLS=$HADOOP_OPTIONAL_TOOLS" >> ${R_HOME}/etc/Renviron.site
if [[ -e "/usr/lib/jvm/adoptopenjdk-8-hotspot-amd64" ]]; then
echo -e "JAVA_HOME=/usr/lib/jvm/adoptopenjdk-8-hotspot-amd64" >> ${R_HOME}/etc/Renviron.site
if [[ -e "$JAVA_HOME" ]]; then
echo -e "JAVA_HOME=$JAVA_HOME" >> ${R_HOME}/etc/Renviron.site
fi
env | grep "KUBERNETES" >> ${R_HOME}/etc/Renviron.site
env | grep "IMAGE_NAME" >> ${R_HOME}/etc/Renviron.site
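With JAVA_HOME now taken from the environment instead of a hard-coded adoptopenjdk-8 path, R sessions inherit whatever JDK the image installed. A quick hedged check from inside a running container:

    # Renviron.site should now expose the Java/Spark variables to R
    Rscript -e 'Sys.getenv(c("JAVA_HOME", "SPARK_HOME", "HADOOP_HOME"))'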
15 changes: 9 additions & 6 deletions spark/Dockerfile
@@ -6,27 +6,30 @@ LABEL maintainer="InseeFrLab <[email protected]>"
ARG BASE_IMAGE

ARG SPARK_VERSION="3.5.0"

ENV SPARK_VERSION=${SPARK_VERSION}
+ ENV JAVA_VERSION="17"
+
+ ENV JAVA_HOME="/usr/lib/jvm/java-$JAVA_VERSION-openjdk-amd64"
+ ENV PATH="${JAVA_HOME}/bin:${PATH}"

ENV HADOOP_HOME="/opt/hadoop"
ENV SPARK_HOME="/opt/spark"
ENV HIVE_HOME="/opt/hive"
ENV PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip"
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M"
ENV JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64"

ENV HADOOP_OPTIONAL_TOOLS="hadoop-aws"

ENV PATH="${JAVA_HOME}/bin:${SPARK_HOME}/bin:${HADOOP_HOME}/bin:${PATH}"
ENV PATH="${SPARK_HOME}/bin:${HADOOP_HOME}/bin:${PATH}"

USER root

RUN --mount=type=secret,id=github_token \
    apt-get update && \
    # Install JDK
-   apt-get install -y --no-install-recommends \
-       ca-certificates-java \
-       openjdk-17-jre-headless && \
-   # Install Spark/Hadoop/Hive
+   /opt/install-java.sh && \
+   # Install Spark/Hadoop/Hive
    /opt/install-spark-hadoop-hive.sh && \
    # Put Spark config in the right place
    cp /opt/spark-env.sh $SPARK_HOME/conf && \
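Since JAVA_VERSION and JAVA_HOME are now declared with ENV rather than only ARG, they survive into running containers (the point of the "added ENV in dockerfiles" commit bullet). A hedged check, with an illustrative image tag:

    # Both lines should succeed if the ENV variables reached the container
    docker run --rm inseefrlab/onyxia-spark:latest sh -c 'echo $JAVA_HOME && java -version'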
6 changes: 5 additions & 1 deletion spark/tests.yaml
@@ -53,4 +53,8 @@ commandTests:
  - name: "Does the binary exist?"
    command: "which"
    args: ["argo"]
-   expectedOutput: ["/usr/local/bin/argo"]
\ No newline at end of file
+   expectedOutput: ["/usr/local/bin/argo"]
+  - name: "Does the binary exist?"
+    command: "which"
+    args: ["java"]
+    expectedOutput: ["/usr/lib/jvm/java-17-openjdk-amd64/bin/java"]
