
Commit 0e4fd33

Authored by Ajay Mahendru (mahendruajay) and co-authors
Release Spark 3.5 (#148)
* Build Spark 3.5
* Update Dockerfile to build for AL2023
* release Spark 3.4-cpu-py39-v1.1 (#141)
* 3.4 cpu py39 v1.1: minor fixes for hadoop-aws integration with aws-java-sdk-v2 (#142)
* 3.4 cpu py39 v1.1: fix lint errors (#143)
* 3.4 cpu py39 v1.1: fix test expectations for a job that failed with AlgorithmError as we ramp up traffic for the V2 stack (#144)
* Add aws-java-sdk-v2 to the Spark classpaths
* 3.4 cpu py39 v1.1: update OS libs to address more CVEs (#146)
* Build Spark 3.5 release: 1/ update to EMR 7.0.0, 2/ update base image to use AL2023
* Update Dockerfile for Spark 3.5 to optimize cache clean
* Fix SM_VERSION for the Spark 3.5 release

Co-authored-by: Ajay Mahendru <mahajay@amazon.com>
1 parent e0c5b69 commit 0e4fd33

File tree

12 files changed: +381 −3 lines

Makefile

Lines changed: 2 additions & 2 deletions

@@ -7,10 +7,10 @@ SHELL := /bin/sh

 # Set variables if testing locally
 ifeq ($(IS_RELEASE_BUILD),)
-SPARK_VERSION := 3.4
+SPARK_VERSION := 3.5
 PROCESSOR := cpu
 FRAMEWORK_VERSION := py39
-SM_VERSION := 1.1
+SM_VERSION := 1.0
 USE_CASE := processing
 BUILD_CONTEXT := ./spark/${USE_CASE}/${SPARK_VERSION}/py3

new_images.yml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 ---
 new_images:
-  - spark: "3.4"
+  - spark: "3.5"
     use-case: "processing"
     processors: ["cpu"]
     python: ["py39"]
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
echo "Not implemented"
Lines changed: 142 additions & 0 deletions

@@ -0,0 +1,142 @@
FROM public.ecr.aws/amazonlinux/amazonlinux:2023
ARG REGION
ENV AWS_REGION ${REGION}

RUN rpm -q system-release --qf '%{VERSION}'

RUN dnf clean all \
    && dnf update -y \
    && dnf install -y awscli vim gcc gzip unzip zip tar wget liblapack* libblas* libopenblas* \
    && dnf install -y openssl openssl-devel \
    && dnf install -y kernel kernel-headers kernel-devel \
    && dnf install -y bzip2-devel libffi-devel sqlite-devel xz-devel \
    && dnf install -y ncurses ncurses-compat-libs binutils \
    && dnf install -y nss-softokn-freebl avahi-libs avahi dbus dbus-libs \
    && dnf install -y python-pillow

# Install python 3.9
ARG PYTHON_BASE_VERSION=3.9
ARG PYTHON_WITH_BASE_VERSION=python${PYTHON_BASE_VERSION}
ARG PIP_WITH_BASE_VERSION=pip${PYTHON_BASE_VERSION}
ARG PYTHON_VERSION=${PYTHON_BASE_VERSION}.18
RUN dnf groupinstall -y 'Development Tools' \
    && wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
    && tar xzf Python-${PYTHON_VERSION}.tgz \
    && cd Python-*/ \
    && ./configure --enable-optimizations \
    && make altinstall \
    && echo -e 'alias python3=python3.9\nalias pip3=pip3.9' >> ~/.bashrc \
    && ln -s $(which ${PYTHON_WITH_BASE_VERSION}) /usr/local/bin/python3 \
    && ln -s $(which ${PIP_WITH_BASE_VERSION}) /usr/local/bin/pip3 \
    && cd .. \
    && rm Python-${PYTHON_VERSION}.tgz \
    && rm -rf Python-${PYTHON_VERSION}

# Amazon Linux 2023 uses dnf instead of yum as the package management tool: https://docs.aws.amazon.com/linux/al2023/ug/package-management.html

# Copied from EMR: https://tiny.amazon.com/kycbidpc/codeamazpackAwsCblob51c8src
RUN dnf install -y java-1.8.0-amazon-corretto-devel nginx python3-virtualenv \
    && dnf -y clean all && rm -rf /var/cache/dnf

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
ENV PYTHONHASHSEED 0
ENV PYTHONIOENCODING UTF-8
ENV PIP_DISABLE_PIP_VERSION_CHECK 1

# Install EMR Spark/Hadoop
ENV HADOOP_HOME /usr/lib/hadoop
ENV HADOOP_CONF_DIR /usr/lib/hadoop/etc/hadoop
ENV SPARK_HOME /usr/lib/spark

COPY yum/emr-apps.repo /etc/yum.repos.d/emr-apps.repo

# Install hadoop / spark dependencies from EMR's yum repository for Spark optimizations.
# Replace the placeholder with the region in the repository URL.
RUN sed -i "s/REGION/${AWS_REGION}/g" /etc/yum.repos.d/emr-apps.repo
RUN ls /etc/yum.repos.d/emr-apps.repo
RUN cat /etc/yum.repos.d/emr-apps.repo
RUN adduser -N hadoop

# These packages are a subset of what EMR installs in a cluster with the
# "hadoop", "spark", and "hive" applications.
# They include EMR-optimized libraries and extras.
RUN dnf install -y aws-hm-client \
    aws-java-sdk \
    emr-goodies \
    emr-scripts \
    emr-s3-select \
    emrfs \
    hadoop \
    hadoop-client \
    hadoop-hdfs \
    hadoop-hdfs-datanode \
    hadoop-hdfs-namenode \
    hadoop-httpfs \
    hadoop-kms \
    hadoop-lzo \
    hadoop-mapreduce \
    hadoop-yarn \
    hadoop-yarn-nodemanager \
    hadoop-yarn-proxyserver \
    hadoop-yarn-resourcemanager \
    hadoop-yarn-timelineserver \
    hive \
    hive-hcatalog \
    hive-hcatalog-server \
    hive-jdbc \
    hive-server2 \
    s3-dist-cp \
    spark-core \
    spark-datanucleus \
    spark-history-server \
    spark-python \
    && dnf -y clean all \
    && rm -rf /var/cache/dnf /var/lib/dnf/* /etc/yum.repos.d/emr-*

# Point Spark at the proper python binary
ENV PYSPARK_PYTHON=/usr/local/bin/python3.9

# Setup Spark/Yarn/HDFS user as root
ENV PATH="/usr/bin:/opt/program:${PATH}"
ENV YARN_RESOURCEMANAGER_USER="root"
ENV YARN_NODEMANAGER_USER="root"
ENV HDFS_NAMENODE_USER="root"
ENV HDFS_DATANODE_USER="root"
ENV HDFS_SECONDARYNAMENODE_USER="root"

# Set up bootstrapping program and Spark configuration
COPY hadoop-config /opt/hadoop-config
COPY nginx-config /opt/nginx-config
COPY aws-config /opt/aws-config
COPY Pipfile Pipfile.lock setup.py *.whl /opt/program/
ENV PIPENV_PIPFILE=/opt/program/Pipfile
# Use the --system flag so pipenv installs all packages into the system python
# rather than into a virtualenv, since Docker containers do not need virtualenvs.
# pipenv > 2022.4.8 fails to build smspark.
RUN /usr/local/bin/python3.9 -m pip --version
RUN /usr/local/bin/python3.9 -m pip install --upgrade pip
RUN /usr/local/bin/python3.9 -m pip install --upgrade pip setuptools wheel

RUN /usr/local/bin/python3.9 -m pip install pipenv==2022.4.8 \
    && pipenv install --system \
    && /usr/local/bin/python3.9 -m pip install /opt/program/*.whl

# Setup container bootstrapper
COPY container-bootstrap-config /opt/container-bootstrap-config
RUN chmod +x /opt/container-bootstrap-config/bootstrap.sh \
    && /opt/container-bootstrap-config/bootstrap.sh

# With this config, the Spark history server will not run as a daemon; otherwise
# there would be no server running and the container would terminate immediately.
ENV SPARK_NO_DAEMONIZE TRUE

WORKDIR $SPARK_HOME

# Install the SageMaker Feature Store Spark connector
# https://docs.aws.amazon.com/sagemaker/latest/dg/batch-ingestion-spark-connector-setup.html
# The Feature Store connector library currently does not support Spark 3.4, so this line is commented out:
# RUN /usr/local/bin/python3.9 -m pip install sagemaker-feature-store-pyspark-3.3==1.1.2 --no-binary :all:

ENTRYPOINT ["smspark-submit"]
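
The entrypoint means the image is driven by smspark-submit when launched as a SageMaker Processing job. A minimal usage sketch with the SageMaker Python SDK's PySparkProcessor follows; it assumes the SDK maps framework_version="3.5" to the image built from this Dockerfile, and the role ARN, bucket names, and script name are placeholders, not values from this commit.

# Hypothetical sketch: run a PySpark script on the Spark 3.5 processing image.
from sagemaker.spark.processing import PySparkProcessor

processor = PySparkProcessor(
    base_job_name="sm-spark-35",
    framework_version="3.5",   # assumed to resolve to this Spark 3.5 image
    py_version="py39",
    role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder role
    instance_count=2,
    instance_type="ml.m5.xlarge",
)

processor.run(
    submit_app="./preprocess.py",                           # placeholder PySpark script
    arguments=["--input", "s3://my-bucket/raw/"],           # placeholder arguments
    spark_event_logs_s3_uri="s3://my-bucket/spark-logs/",   # placeholder log sink
)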
Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://nn_uri/</value>
    <description>NameNode URI</description>
  </property>
  <property>
    <name>fs.s3a.aws.credentials.provider</name>
    <value>com.amazonaws.auth.DefaultAWSCredentialsProviderChain</value>
    <description>AWS S3 credential provider</description>
  </property>
  <property>
    <name>fs.s3.impl</name>
    <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
    <description>s3a filesystem implementation</description>
  </property>
  <property>
    <name>fs.AbstractFileSystem.s3a.imp</name>
    <value>org.apache.hadoop.fs.s3a.S3A</value>
    <description>s3a filesystem implementation</description>
  </property>
  <property>
    <name>fs.s3a.connection.maximum</name>
    <value>100</value>
    <description>s3a filesystem maximum connection</description>
  </property>
</configuration>
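
These settings route s3:// and s3a:// URIs through Hadoop's S3AFileSystem and pull credentials from the default AWS provider chain (typically the job's execution role), so application code never embeds keys. A minimal illustration of what that enables from PySpark inside the container; the bucket paths are placeholders.

# Sketch: read and write S3 data via s3a, relying on the core-site.xml settings above
# for the filesystem implementation and credentials. Paths are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("s3a-smoke-test").getOrCreate()

df = spark.read.json("s3a://my-bucket/input/")                       # placeholder input
df.limit(10).write.mode("overwrite").parquet("s3a://my-bucket/out/")  # placeholder output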
Lines changed: 67 additions & 0 deletions

@@ -0,0 +1,67 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///opt/amazon/hadoop/hdfs/datanode</value>
    <description>Comma separated list of paths on the local filesystem of a DataNode where it should store its blocks.</description>
  </property>

  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///opt/amazon/hadoop/hdfs/namenode</value>
    <description>Path on the local filesystem where the NameNode stores the namespace and transaction logs persistently.</description>
  </property>

  <!-- Fix for "Failed to replace a bad datanode on the existing pipeline due to no more good datanodes being available to try".
       From https://community.cloudera.com/t5/Support-Questions/Failed-to-replace-a-bad-datanode-on-the-existing-pipeline/td-p/207711
       This issue can be caused by continuous network issues or repeated packet drops. It typically happens when data is
       being written to a DataNode that is pipelining the data to the next DataNode, and a communication issue leads to a
       pipeline failure. We only see this issue in small regions. -->
  <property>
    <name>dfs.client.block.write.replace-datanode-on-failure.enable</name>
    <value>true</value>
    <description>
      If there is a datanode/network failure in the write pipeline,
      DFSClient will try to remove the failed datanode from the pipeline
      and then continue writing with the remaining datanodes. As a result,
      the number of datanodes in the pipeline is decreased. The feature is
      to add new datanodes to the pipeline.

      This is a site-wide property to enable/disable the feature.

      When the cluster size is extremely small, e.g. 3 nodes or less, cluster
      administrators may want to set the policy to NEVER in the default
      configuration file or disable this feature. Otherwise, users may
      experience an unusually high rate of pipeline failures since it is
      impossible to find new datanodes for replacement.

      See also dfs.client.block.write.replace-datanode-on-failure.policy
    </description>
  </property>

  <property>
    <name>dfs.client.block.write.replace-datanode-on-failure.policy</name>
    <value>ALWAYS</value>
    <description>
      This property is used only if the value of
      dfs.client.block.write.replace-datanode-on-failure.enable is true.

      ALWAYS: always add a new datanode when an existing datanode is removed.

      NEVER: never add a new datanode.

      DEFAULT:
        Let r be the replication number.
        Let n be the number of existing datanodes.
        Add a new datanode only if r is greater than or equal to 3 and either
        (1) floor(r/2) is greater than or equal to n; or
        (2) r is greater than n and the block is hflushed/appended.
    </description>
  </property>
</configuration>
Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
spark.driver.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/aws-java-sdk-v2/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
spark.driver.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native
spark.executor.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/aws-java-sdk-v2/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
spark.executor.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native
spark.driver.host=sd_host
spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2

# Fix for "Uncaught exception: org.apache.spark.rpc.RpcTimeoutException: Cannot
# receive any reply from 10.0.109.30:35219 in 120 seconds."
spark.rpc.askTimeout=300s
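
These defaults put EMRFS, hadoop-aws, the AWS SDKs (v1 and v2), and the other EMR jars on the driver and executor classpaths, and raise the RPC ask timeout to 300s. Individual jobs can still override such settings at session creation; the following is a small sketch with illustrative values, not configuration shipped in this commit.

# Sketch: per-job overrides of the file-level defaults above. Values are illustrative.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("override-defaults")
    .config("spark.rpc.askTimeout", "600s")                       # raise beyond the 300s default
    .config("spark.hadoop.fs.s3a.connection.maximum", "200")      # loosen the s3a connection cap
    .getOrCreate()
)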
Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
# EMPTY FILE TO AVOID OVERRIDING ENV VARS
# Specifically, without copying the empty file, SPARK_HISTORY_OPTS will be overridden,
# spark.history.ui.port defaults to 18082, and spark.eventLog.dir defaults to the local fs
Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
<?xml version="1.0"?>
<!-- Site specific YARN configuration properties -->
<configuration>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>rm_hostname</value>
    <description>The hostname of the RM.</description>
  </property>
  <property>
    <name>yarn.nodemanager.hostname</name>
    <value>nm_hostname</value>
    <description>The hostname of the NM.</description>
  </property>
  <property>
    <name>yarn.nodemanager.webapp.address</name>
    <value>nm_webapp_address</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-pmem-ratio</name>
    <value>5</value>
    <description>Ratio of virtual memory to physical memory.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.am.max-attempts</name>
    <value>1</value>
    <description>The maximum number of application attempts.</description>
  </property>
  <property>
    <name>yarn.nodemanager.env-whitelist</name>
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,YARN_HOME,AWS_CONTAINER_CREDENTIALS_RELATIVE_URI,AWS_REGION</value>
    <description>Environment variable whitelist</description>
  </property>

</configuration>
Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
server {
    listen 15050;
    server_name localhost;
    client_header_buffer_size 128k;
    large_client_header_buffers 4 128k;

    location ~ ^/history/(.*)/(.*)/jobs/$ {
        proxy_pass http://localhost:18080/history/$1/jobs/;
        proxy_redirect http://localhost:18080/history/$1/jobs/ $domain_name/proxy/15050/history/$1/jobs/;
        expires off;
    }

    location / {
        proxy_pass http://localhost:18080;
        expires off;
    }
}
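
This server block reverse-proxies the Spark history server UI (port 18080) behind port 15050 so it can be reached through SageMaker's proxying. For browsing event logs locally, the SageMaker Python SDK can start the same history server from this image; a sketch, assuming the `processor` object from the earlier PySparkProcessor example and a placeholder S3 log location (the methods come from the SDK's Spark processor, not from this commit).

# Sketch: launch the container's Spark history server against S3 event logs.
processor.start_history_server(
    spark_event_logs_s3_uri="s3://my-bucket/spark-logs/"   # placeholder log location
)
# ... browse the history UI, then shut it down:
processor.terminate_history_server()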
