
Commit 0e4fd33

Authored by Ajay Mahendru (mahendruajay) and co-authors
Release Spark 3.5 (#148)
* Build Spark 3.5
* Update Dockerfile to build for AL2023
* release Spark 3.4-cpu-py39-v1.1 (#141)
* 3.4 cpu py39 v1.1: minor fixes for hadoop-aws integration with aws-java-sdk-v2 (#142)
* 3.4 cpu py39 v1.1: fix lint errors (#143)
* 3.4 cpu py39 v1.1: fix test expectations for a job that failed with AlgorithmError as we ramp up traffic for the V2 stack (#144)
* Add aws-java-sdk-v2 to the Spark classpaths
* 3.4 cpu py39 v1.1: update OS libs to address more CVEs (#146)
* Build Spark 3.5 release: 1/ update to EMR 7.0.0, 2/ update base image to use AL2023
* Update Dockerfile for Spark 3.5 to optimize cache clean
* Fix SM_VERSION for the Spark 3.5 release

Co-authored-by: Ajay Mahendru <mahajay@amazon.com>
1 parent e0c5b69 commit 0e4fd33

File tree

12 files changed: +381 −3 lines

Makefile

Lines changed: 2 additions & 2 deletions

@@ -7,10 +7,10 @@ SHELL := /bin/sh

 # Set variables if testing locally
 ifeq ($(IS_RELEASE_BUILD),)
-SPARK_VERSION := 3.4
+SPARK_VERSION := 3.5
 PROCESSOR := cpu
 FRAMEWORK_VERSION := py39
-SM_VERSION := 1.1
+SM_VERSION := 1.0
 USE_CASE := processing
 BUILD_CONTEXT := ./spark/${USE_CASE}/${SPARK_VERSION}/py3

new_images.yml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 ---
 new_images:
-  - spark: "3.4"
+  - spark: "3.5"
     use-case: "processing"
     processors: ["cpu"]
     python: ["py39"]
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
echo "Not implemented"
Lines changed: 142 additions & 0 deletions

@@ -0,0 +1,142 @@
FROM public.ecr.aws/amazonlinux/amazonlinux:2023
ARG REGION
ENV AWS_REGION ${REGION}

RUN rpm -q system-release --qf '%{VERSION}'

RUN dnf clean all \
    && dnf update -y \
    && dnf install -y awscli vim gcc gzip unzip zip tar wget liblapack* libblas* libopenblas* \
    && dnf install -y openssl openssl-devel \
    && dnf install -y kernel kernel-headers kernel-devel \
    && dnf install -y bzip2-devel libffi-devel sqlite-devel xz-devel \
    && dnf install -y ncurses ncurses-compat-libs binutils \
    && dnf install -y nss-softokn-freebl avahi-libs avahi dbus dbus-libs \
    && dnf install -y python-pillow

# Install python 3.9
ARG PYTHON_BASE_VERSION=3.9
ARG PYTHON_WITH_BASE_VERSION=python${PYTHON_BASE_VERSION}
ARG PIP_WITH_BASE_VERSION=pip${PYTHON_BASE_VERSION}
ARG PYTHON_VERSION=${PYTHON_BASE_VERSION}.18
RUN dnf groupinstall -y 'Development Tools' \
    && wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
    && tar xzf Python-${PYTHON_VERSION}.tgz \
    && cd Python-*/ \
    && ./configure --enable-optimizations \
    && make altinstall \
    && echo -e 'alias python3=python3.9\nalias pip3=pip3.9' >> ~/.bashrc \
    && ln -s $(which ${PYTHON_WITH_BASE_VERSION}) /usr/local/bin/python3 \
    && ln -s $(which ${PIP_WITH_BASE_VERSION}) /usr/local/bin/pip3 \
    && cd .. \
    && rm Python-${PYTHON_VERSION}.tgz \
    && rm -rf Python-${PYTHON_VERSION}

# Amazon Linux 2023 uses dnf instead of yum as the package management tool: https://docs.aws.amazon.com/linux/al2023/ug/package-management.html

# Copied from EMR: https://tiny.amazon.com/kycbidpc/codeamazpackAwsCblob51c8src
RUN dnf install -y java-1.8.0-amazon-corretto-devel nginx python3-virtualenv \
    && dnf -y clean all && rm -rf /var/cache/dnf

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
ENV PYTHONHASHSEED 0
ENV PYTHONIOENCODING UTF-8
ENV PIP_DISABLE_PIP_VERSION_CHECK 1

# Install EMR Spark/Hadoop
ENV HADOOP_HOME /usr/lib/hadoop
ENV HADOOP_CONF_DIR /usr/lib/hadoop/etc/hadoop
ENV SPARK_HOME /usr/lib/spark

COPY yum/emr-apps.repo /etc/yum.repos.d/emr-apps.repo

# Install hadoop / spark dependencies from EMR's yum repository for Spark optimizations.
# Replace the placeholder with the region in the repository URL.
RUN sed -i "s/REGION/${AWS_REGION}/g" /etc/yum.repos.d/emr-apps.repo
RUN ls /etc/yum.repos.d/emr-apps.repo
RUN cat /etc/yum.repos.d/emr-apps.repo
RUN adduser -N hadoop

# These packages are a subset of what EMR installs in a cluster with the
# "hadoop", "spark", and "hive" applications.
# They include EMR-optimized libraries and extras.
RUN dnf install -y aws-hm-client \
    aws-java-sdk \
    emr-goodies \
    emr-scripts \
    emr-s3-select \
    emrfs \
    hadoop \
    hadoop-client \
    hadoop-hdfs \
    hadoop-hdfs-datanode \
    hadoop-hdfs-namenode \
    hadoop-httpfs \
    hadoop-kms \
    hadoop-lzo \
    hadoop-mapreduce \
    hadoop-yarn \
    hadoop-yarn-nodemanager \
    hadoop-yarn-proxyserver \
    hadoop-yarn-resourcemanager \
    hadoop-yarn-timelineserver \
    hive \
    hive-hcatalog \
    hive-hcatalog-server \
    hive-jdbc \
    hive-server2 \
    s3-dist-cp \
    spark-core \
    spark-datanucleus \
    spark-history-server \
    spark-python \
    && dnf -y clean all \
    && rm -rf /var/cache/dnf /var/lib/dnf/* /etc/yum.repos.d/emr-*

# Point Spark at the proper python binary
ENV PYSPARK_PYTHON=/usr/local/bin/python3.9

# Setup Spark/Yarn/HDFS user as root
ENV PATH="/usr/bin:/opt/program:${PATH}"
ENV YARN_RESOURCEMANAGER_USER="root"
ENV YARN_NODEMANAGER_USER="root"
ENV HDFS_NAMENODE_USER="root"
ENV HDFS_DATANODE_USER="root"
ENV HDFS_SECONDARYNAMENODE_USER="root"

# Set up bootstrapping program and Spark configuration
COPY hadoop-config /opt/hadoop-config
COPY nginx-config /opt/nginx-config
COPY aws-config /opt/aws-config
COPY Pipfile Pipfile.lock setup.py *.whl /opt/program/
ENV PIPENV_PIPFILE=/opt/program/Pipfile
# Use the --system flag so pipenv installs all packages into the system python
# rather than into a virtualenv, since Docker containers do not need virtualenvs.
# pipenv > 2022.4.8 fails to build smspark.
RUN /usr/local/bin/python3.9 -m pip --version
RUN /usr/local/bin/python3.9 -m pip install --upgrade pip
RUN /usr/local/bin/python3.9 -m pip install --upgrade pip setuptools wheel

RUN /usr/local/bin/python3.9 -m pip install pipenv==2022.4.8 \
    && pipenv install --system \
    && /usr/local/bin/python3.9 -m pip install /opt/program/*.whl

# Setup container bootstrapper
COPY container-bootstrap-config /opt/container-bootstrap-config
RUN chmod +x /opt/container-bootstrap-config/bootstrap.sh \
    && /opt/container-bootstrap-config/bootstrap.sh

# With this config, the Spark history server will not run as a daemon; otherwise
# there would be no server running and the container would terminate immediately.
ENV SPARK_NO_DAEMONIZE TRUE

WORKDIR $SPARK_HOME

# Install the SageMaker Feature Store Spark connector
# https://docs.aws.amazon.com/sagemaker/latest/dg/batch-ingestion-spark-connector-setup.html
# The Feature Store connector library currently does not support Spark 3.4, so this line is commented out:
# RUN /usr/local/bin/python3.9 -m pip install sagemaker-feature-store-pyspark-3.3==1.1.2 --no-binary :all:

ENTRYPOINT ["smspark-submit"]
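
The entrypoint means the image is driven by smspark-submit when launched as a SageMaker Processing job. A minimal usage sketch with the SageMaker Python SDK's PySparkProcessor follows; it assumes the SDK maps framework_version="3.5" to the image built from this Dockerfile, and the role ARN, bucket names, and script name are placeholders, not values from this commit.

# Hypothetical sketch: run a PySpark script on the Spark 3.5 processing image.
from sagemaker.spark.processing import PySparkProcessor

processor = PySparkProcessor(
    base_job_name="sm-spark-35",
    framework_version="3.5",   # assumed to resolve to this Spark 3.5 image
    py_version="py39",
    role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder role
    instance_count=2,
    instance_type="ml.m5.xlarge",
)

processor.run(
    submit_app="./preprocess.py",                           # placeholder PySpark script
    arguments=["--input", "s3://my-bucket/raw/"],           # placeholder arguments
    spark_event_logs_s3_uri="s3://my-bucket/spark-logs/",   # placeholder log sink
)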
Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://nn_uri/</value>
    <description>NameNode URI</description>
  </property>
  <property>
    <name>fs.s3a.aws.credentials.provider</name>
    <value>com.amazonaws.auth.DefaultAWSCredentialsProviderChain</value>
    <description>AWS S3 credential provider</description>
  </property>
  <property>
    <name>fs.s3.impl</name>
    <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
    <description>s3a filesystem implementation</description>
  </property>
  <property>
    <name>fs.AbstractFileSystem.s3a.imp</name>
    <value>org.apache.hadoop.fs.s3a.S3A</value>
    <description>s3a filesystem implementation</description>
  </property>
  <property>
    <name>fs.s3a.connection.maximum</name>
    <value>100</value>
    <description>s3a filesystem maximum connection</description>
  </property>
</configuration>
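
These settings route s3:// and s3a:// URIs through Hadoop's S3AFileSystem and pull credentials from the default AWS provider chain (typically the job's execution role), so application code never embeds keys. A minimal illustration of what that enables from PySpark inside the container; the bucket paths are placeholders.

# Sketch: read and write S3 data via s3a, relying on the core-site.xml settings above
# for the filesystem implementation and credentials. Paths are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("s3a-smoke-test").getOrCreate()

df = spark.read.json("s3a://my-bucket/input/")                       # placeholder input
df.limit(10).write.mode("overwrite").parquet("s3a://my-bucket/out/")  # placeholder output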
Lines changed: 67 additions & 0 deletions

@@ -0,0 +1,67 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///opt/amazon/hadoop/hdfs/datanode</value>
    <description>Comma separated list of paths on the local filesystem of a DataNode where it should store its blocks.</description>
  </property>

  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///opt/amazon/hadoop/hdfs/namenode</value>
    <description>Path on the local filesystem where the NameNode stores the namespace and transaction logs persistently.</description>
  </property>

  <!-- Fix for "Failed to replace a bad datanode on the existing pipeline due to no more good datanodes being available to try".
       From https://community.cloudera.com/t5/Support-Questions/Failed-to-replace-a-bad-datanode-on-the-existing-pipeline/td-p/207711
       This issue can be caused by continuous network issues or repeated packet drops. It typically happens when data is
       being written to a DataNode that is pipelining the data to the next DataNode, and a communication issue leads to a
       pipeline failure. We only see this issue in small regions. -->
  <property>
    <name>dfs.client.block.write.replace-datanode-on-failure.enable</name>
    <value>true</value>
    <description>
      If there is a datanode/network failure in the write pipeline,
      DFSClient will try to remove the failed datanode from the pipeline
      and then continue writing with the remaining datanodes. As a result,
      the number of datanodes in the pipeline is decreased. The feature is
      to add new datanodes to the pipeline.

      This is a site-wide property to enable/disable the feature.

      When the cluster size is extremely small, e.g. 3 nodes or less, cluster
      administrators may want to set the policy to NEVER in the default
      configuration file or disable this feature. Otherwise, users may
      experience an unusually high rate of pipeline failures since it is
      impossible to find new datanodes for replacement.

      See also dfs.client.block.write.replace-datanode-on-failure.policy
    </description>
  </property>

  <property>
    <name>dfs.client.block.write.replace-datanode-on-failure.policy</name>
    <value>ALWAYS</value>
    <description>
      This property is used only if the value of
      dfs.client.block.write.replace-datanode-on-failure.enable is true.

      ALWAYS: always add a new datanode when an existing datanode is removed.

      NEVER: never add a new datanode.

      DEFAULT:
        Let r be the replication number.
        Let n be the number of existing datanodes.
        Add a new datanode only if r is greater than or equal to 3 and either
        (1) floor(r/2) is greater than or equal to n; or
        (2) r is greater than n and the block is hflushed/appended.
    </description>
  </property>
</configuration>
Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
spark.driver.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/aws-java-sdk-v2/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
spark.driver.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native
spark.executor.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/aws-java-sdk-v2/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
spark.executor.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native
spark.driver.host=sd_host
spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2

# Fix for "Uncaught exception: org.apache.spark.rpc.RpcTimeoutException: Cannot
# receive any reply from 10.0.109.30:35219 in 120 seconds."
spark.rpc.askTimeout=300s
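
These defaults put EMRFS, hadoop-aws, the AWS SDKs (v1 and v2), and the other EMR jars on the driver and executor classpaths, and raise the RPC ask timeout to 300s. Individual jobs can still override such settings at session creation; the following is a small sketch with illustrative values, not configuration shipped in this commit.

# Sketch: per-job overrides of the file-level defaults above. Values are illustrative.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("override-defaults")
    .config("spark.rpc.askTimeout", "600s")                       # raise beyond the 300s default
    .config("spark.hadoop.fs.s3a.connection.maximum", "200")      # loosen the s3a connection cap
    .getOrCreate()
)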
Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
# EMPTY FILE TO AVOID OVERRIDING ENV VARS
# Specifically, without copying the empty file, SPARK_HISTORY_OPTS will be overridden,
# spark.history.ui.port defaults to 18082, and spark.eventLog.dir defaults to the local fs
Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
<?xml version="1.0"?>
<!-- Site specific YARN configuration properties -->
<configuration>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>rm_hostname</value>
    <description>The hostname of the RM.</description>
  </property>
  <property>
    <name>yarn.nodemanager.hostname</name>
    <value>nm_hostname</value>
    <description>The hostname of the NM.</description>
  </property>
  <property>
    <name>yarn.nodemanager.webapp.address</name>
    <value>nm_webapp_address</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-pmem-ratio</name>
    <value>5</value>
    <description>Ratio of virtual memory to physical memory.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.am.max-attempts</name>
    <value>1</value>
    <description>The maximum number of application attempts.</description>
  </property>
  <property>
    <name>yarn.nodemanager.env-whitelist</name>
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,YARN_HOME,AWS_CONTAINER_CREDENTIALS_RELATIVE_URI,AWS_REGION</value>
    <description>Environment variable whitelist</description>
  </property>

</configuration>
Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
server {
    listen 15050;
    server_name localhost;
    client_header_buffer_size 128k;
    large_client_header_buffers 4 128k;

    location ~ ^/history/(.*)/(.*)/jobs/$ {
        proxy_pass http://localhost:18080/history/$1/jobs/;
        proxy_redirect http://localhost:18080/history/$1/jobs/ $domain_name/proxy/15050/history/$1/jobs/;
        expires off;
    }

    location / {
        proxy_pass http://localhost:18080;
        expires off;
    }
}
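
This server block reverse-proxies the Spark history server UI (port 18080) behind port 15050 so it can be reached through SageMaker's proxying. For browsing event logs locally, the SageMaker Python SDK can start the same history server from this image; a sketch, assuming the `processor` object from the earlier PySparkProcessor example and a placeholder S3 log location (the methods come from the SDK's Spark processor, not from this commit).

# Sketch: launch the container's Spark history server against S3 event logs.
processor.start_history_server(
    spark_event_logs_s3_uri="s3://my-bucket/spark-logs/"   # placeholder log location
)
# ... browse the history UI, then shut it down:
processor.terminate_history_server()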
