I have created a Dockerfile with Spark and I run my Spark job inside it for testing:
FROM python:3.10.9-buster
###########################################
# Upgrade the packages
###########################################
# Download latest listing of available packages:
RUN apt-get -y update
# Upgrade already installed packages:
RUN apt-get -y upgrade
###########################################
# install tree package
###########################################
# Install a new package:
RUN apt-get -y install tree
#############################################
# install pipenv
############################################
ENV PIPENV_VENV_IN_PROJECT=1
# PIPENV_VENV_IN_PROJECT=1 is important: it causes the resulting virtual
# environment to be created as /app/.venv. Without this the environment gets
# created somewhere surprising, such as /root/.local/share/virtualenvs/app-4PlAip0Q,
# which makes it much harder to write automation scripts later on.
RUN python -m pip install --upgrade pip
RUN pip install --no-cache-dir pipenv
RUN pip install --no-cache-dir jupyter
RUN pip install --no-cache-dir py4j
RUN pip install --no-cache-dir findspark
#############################################
# install java and spark and hadoop
# Java is required for scala and scala is required for Spark
############################################
# VERSIONS
ENV SPARK_VERSION=3.2.4 \
HADOOP_VERSION=3.2 \
JAVA_VERSION=11
RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
"openjdk-${JAVA_VERSION}-jre-headless" \
ca-certificates-java \
curl && \
apt-get clean && rm -rf /var/lib/apt/lists/*
RUN java --version
# DOWNLOAD SPARK AND INSTALL
RUN DOWNLOAD_URL_SPARK="https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" \
&& wget --verbose -O apache-spark.tgz "${DOWNLOAD_URL_SPARK}"\
&& mkdir -p /home/spark \
&& tar -xf apache-spark.tgz -C /home/spark --strip-components=1 \
&& rm apache-spark.tgz
# SET SPARK ENV VARIABLES
ENV SPARK_HOME="/home/spark"
ENV PATH="${SPARK_HOME}/bin/:${PATH}"
# Fix Spark installation for Java 11 and Apache Arrow library
# see: https://github.com/apache/spark/pull/27356, https://spark.apache.org/docs/latest/#downloading
RUN cp -p "${SPARK_HOME}/conf/spark-defaults.conf.template" "${SPARK_HOME}/conf/spark-defaults.conf" && \
echo 'spark.driver.extraJavaOptions -Dio.netty.tryReflectionSetAccessible=true' >> "${SPARK_HOME}/conf/spark-defaults.conf" && \
echo 'spark.executor.extraJavaOptions -Dio.netty.tryReflectionSetAccessible=true' >> "${SPARK_HOME}/conf/spark-defaults.conf"
############################################
# create group and user
############################################
ARG UNAME=simha
ARG UID=1000
ARG GID=1000
RUN cat /etc/passwd
# create group
RUN groupadd -g $GID $UNAME
# create a user with uid 1000 and gid 1000
RUN useradd -u $UID -g $GID -m -s /bin/bash $UNAME
# -m creates home directory
# change ownership of /home/simha to 1000:1000
RUN chown $UID:$GID /home/simha
###########################################
# add sudo
###########################################
RUN apt-get update --yes
RUN apt-get -y install sudo
RUN apt-get -y install vim
RUN cat /etc/sudoers
RUN echo "$UNAME ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
RUN cat /etc/sudoers
#############################
# spark history server
############################
# Allow the Spark history server (mount a local spark_events folder to /home/simha/app/spark_events)
RUN echo 'spark.eventLog.enabled true' >> "${SPARK_HOME}/conf/spark-defaults.conf" && \
echo 'spark.eventLog.dir file:///home/simha/app/spark_events' >> "${SPARK_HOME}/conf/spark-defaults.conf" && \
echo 'spark.history.fs.logDirectory file:///home/simha/app/spark_events' >> "${SPARK_HOME}/conf/spark-defaults.conf"
RUN mkdir /home/spark/logs
RUN chown $UID:$GID /home/spark/logs
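# Note: the history server itself is not started by this image; once the
# container is running (with spark_events mounted under /home/simha/app as
# noted above), it can be started manually with:
#   ${SPARK_HOME}/sbin/start-history-server.sh
# and its UI is served on port 18080 by default.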
###########################################
# change working dir and user
###########################################
USER $UNAME
RUN mkdir -p /home/$UNAME/app
WORKDIR /home/$UNAME/app
And then I go into the Docker container:
hostfolder="$(pwd)"
dockerfolder="/home/simha/app"
docker run --rm -it \
--net="host" \
-v "${hostfolder}:${dockerfolder}" \
python_spark_custom_build:latest /bin/bash
Inside this container I start the pyspark shell.
So everything runs inside a single container.
I check the Web UI to inspect the executors.
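As a side note, the same information can also be checked from inside the pyspark shell itself; a small sketch, assuming the shell's default sc variable:
# the pyspark shell exposes the SparkContext as `sc`
print(sc.master)              # e.g. 'local[*]' when no --master is passed
print(sc.uiWebUrl)            # address of the Web UI, normally on port 4040
print(sc.defaultParallelism)  # default parallelism (number of cores in local mode)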
Q1. I see only the driver and no worker nodes. Can the driver also act as a worker?
Q2. How can I create a cluster inside the container? I want to set up 1 driver and 4 workers in this one container so that parallelization can be achieved.
I plan to run my Spark scripts as ECS tasks using this Docker container. I don't want to use EMR or Glue.
I am happy with a single node (acting as both worker and driver) as long as multiple executors are running, so that parallelization is achieved.
My understanding is that the driver and the executors are the core of parallelization, regardless of whether they run on separate nodes or all together on one node.
1 Answer
A single-node cluster is a cluster consisting of an Apache Spark driver and no Spark workers.
See the Databricks documentation: https://docs.databricks.com/clusters/single-node.html
If you want to create a multi-node cluster, say 1 master and 4 workers, you can refer to this Medium article: https://medium.com/@MarinAgli1/setting-up-a-spark-standalone-cluster-on-docker-in-layman-terms-8cbdc9fdd14b
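To make this concrete, here is a minimal PySpark sketch (my illustration, not part of the answer) of getting parallelism inside a single container without any separate worker processes: with local[4] the driver JVM runs 4 task threads, which is also why the Web UI's Executors tab shows only the driver.
from pyspark.sql import SparkSession

# local[4]: run Spark in a single JVM (the driver) with 4 worker threads
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("local-parallelism-check")
    .getOrCreate()
)

# spread the numbers over 8 partitions; tasks run 4 at a time
rdd = spark.sparkContext.parallelize(range(1_000_000), numSlices=8)
print(rdd.map(lambda x: x * x).sum())

print(spark.sparkContext.defaultParallelism)  # 4 with local[4]
spark.stop()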