执行任务

./spark-submit \
--class cn.com.dtmobile.spark.DebugTest \
--master yarn \
--deploy-mode client \
--num-executors \
--executor-cores \
--executor-memory 1G \
/home/etluser/kong/debugTest/pucchSinr.jar

${SPARK_HOME}/bin/spark-submit脚本

if [ -z "${SPARK_HOME}" ]; then
source "$(dirname "$")"/find-spark-home
fi # disable randomized hash for string in Python 3.3+
export PYTHONHASHSEED= exec "${SPARK_HOME}"/bin/spark-class org.apache.spark.deploy.SparkSubmit "$@"

$@表示所有接收的参数:

$@=

--class cn.com.dtmobile.spark.DebugTest --master yarn --deploy-mode client --num-executors  --executor-cores  --executor-memory 1G /home/etluser/kong/debugTest/pucchSinr.jar

最后exec执行的内容:
exec=

/home/etluser/kong/spark/spark-2.3.-bin/spark-2.3.-bin-hadoop2./bin/spark-class org.apache.spark.deploy.SparkSubmit --class cn.com.dtmobile.spark.DebugTest --master yarn --deploy-mode client --num-executors  --executor-cores  --executor-memory 1G /home/etluser/kong/debugTest/pucchSinr.jar

所以传给spark-class的参数为:

 org.apache.spark.deploy.SparkSubmit --class cn.com.dtmobile.spark.DebugTest --master yarn --deploy-mode client --num-executors  --executor-cores  --executor-memory 1G /home/etluser/kong/debugTest/pucchSinr.jar

${SPARK_HOME}/bin/spark-class脚本

if [ -z "${SPARK_HOME}" ]; then #如果${SPARK_HOME}长度为0
source "$(dirname "$")"/find-spark-home
fi . "${SPARK_HOME}"/bin/load-spark-env.sh # Find the java binary
if [ -n "${JAVA_HOME}" ]; then #如果${JAVA_HOME}长度不为0
RUNNER="${JAVA_HOME}/bin/java"
else
if [ "$(command -v java)" ]; then
RUNNER="java"
else
echo "JAVA_HOME is not set" >&
exit
fi
fi # Find Spark jars.
if [ -d "${SPARK_HOME}/jars" ]; then #如果${SPARK_HOME}/jars文件夹存在的话
SPARK_JARS_DIR="${SPARK_HOME}/jars"
else
SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars" #不存在的话就去assembly目录下找
fi if [ ! -d "$SPARK_JARS_DIR" ] && [ -z "$SPARK_TESTING$SPARK_SQL_TESTING" ]; then #如果SPARK_JARS_DIR目录不存在,并且相关测试目录也为空
echo "Failed to find Spark jars directory ($SPARK_JARS_DIR)." >&
echo "You need to build Spark with the target \"package\" before running this program." >&
exit
else
LAUNCH_CLASSPATH="$SPARK_JARS_DIR/*" #指定加载时候的类路径为SPARK_JARS_DIR/所有jar包
fi # Add the launcher build dir to the classpath if requested.
if [ -n "$SPARK_PREPEND_CLASSES" ]; then
LAUNCH_CLASSPATH="${SPARK_HOME}/launcher/target/scala-$SPARK_SCALA_VERSION/classes:$LAUNCH_CLASSPATH"
fi # For tests
if [[ -n "$SPARK_TESTING" ]]; then
unset YARN_CONF_DIR
unset HADOOP_CONF_DIR
fi # The launcher library will print arguments separated by a NULL character, to allow arguments with
# characters that would be otherwise interpreted by the shell. Read that in a while loop, populating
# an array that will be used to exec the final command.
#
# The exit code of the launcher is appended to the output, so the parent shell removes it from the
# command array and checks the value to see if the launcher succeeded.
build_command() {
#$RUNNER为java,调用类路径中的org.apache.spark.launcher.Main类 参数为spark-submit指定的所有参数,在这里调用launcher生成下面jvm command
"$RUNNER" -Xmx128m -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@"
printf "%d\0" $?
} # Turn off posix mode since it does not allow process substitution
set +o posix
CMD=()
while IFS= read -d '' -r ARG; do
CMD+=("$ARG")
done < <(build_command "$@") COUNT=${#CMD[@]}
LAST=$((COUNT - ))
LAUNCHER_EXIT_CODE=${CMD[$LAST]} # Certain JVM failures result in errors being printed to stdout (instead of stderr), which causes
# the code that parses the output of the launcher to get confused. In those cases, check if the
# exit code is an integer, and if it's not, handle it as a special error case.
if ! [[ $LAUNCHER_EXIT_CODE =~ ^[-]+$ ]]; then
echo "${CMD[@]}" | head -n- >&
exit
fi if [ $LAUNCHER_EXIT_CODE != ]; then
exit $LAUNCHER_EXIT_CODE
fi CMD=("${CMD[@]:0:$LAST}")
exec "${CMD[@]}"

最后exec执行的CMD为:

${CMD[@]}=

/usr/lib/java/jdk1.8.0_144/bin/java -cp \
/home/etluser/kong/spark/spark-2.3.4-bin/spark-2.3.4-bin-hadoop2.6/conf/:/home/etluser/kong/spark/spark-2.3.4-bin/spark-2.3.4-bin-hadoop2.6/jars/* \
-Xmx1g \
org.apache.spark.deploy.SparkSubmit \
--master yarn \
--deploy-mode client \
--class cn.com.dtmobile.spark.DebugTest \
--num-executors 3 \
--executor-cores 2 \
--executor-memory 1G \
/home/etluser/kong/debugTest/pucchSinr.jar

也就是通过上面launcher生成的cmd


org.apache.spark.launcher.Main类

  public static void main(String[] argsArray) throws Exception {
checkArgument(argsArray.length > 0, "Not enough arguments: missing class name.");
//argsArray:spark-submit脚本传给spark-class的参数
List<String> args = new ArrayList<>(Arrays.asList(argsArray));
String className = args.remove(0); //这里className为org.apache.spark.deploy.SparkSubmit boolean printLaunchCommand = !isEmpty(System.getenv("SPARK_PRINT_LAUNCH_COMMAND"));
AbstractCommandBuilder builder;
if (className.equals("org.apache.spark.deploy.SparkSubmit")) {
try {
builder = new SparkSubmitCommandBuilder(args); //通过SparkSubmitCommandBuilder来生成cmd
} catch (IllegalArgumentException e) {
printLaunchCommand = false;
System.err.println("Error: " + e.getMessage());
System.err.println(); MainClassOptionParser parser = new MainClassOptionParser();
try {
parser.parse(args);
} catch (Exception ignored) {
// Ignore parsing exceptions.
} List<String> help = new ArrayList<>();
if (parser.className != null) {
help.add(parser.CLASS);
help.add(parser.className);
}
help.add(parser.USAGE_ERROR);
builder = new SparkSubmitCommandBuilder(help);
}
} else {
builder = new SparkClassCommandBuilder(className, args);
} Map<String, String> env = new HashMap<>();
List<String> cmd = builder.buildCommand(env);//调用buildCommand生成cmd
if (printLaunchCommand) {
System.err.println("Spark Command: " + join(" ", cmd));
System.err.println("========================================");
} if (isWindows()) {
System.out.println(prepareWindowsCommand(cmd, env));
} else {
// In bash, use NULL as the arg separator since it cannot be used in an argument.
List<String> bashCmd = prepareBashCommand(cmd, env);
for (String c : bashCmd) {
System.out.print(c);
System.out.print('\0');
}
}
}

调用SparkSubmitCommandBuilder来生成Spark Submit命令,如下:

org.apache.spark.launcher.SparkSubmitCommandBuilder

//创建SparkSubmitCommandBuilder对象时候,对应的构造方法  
SparkSubmitCommandBuilder(List<String> args) {
this.allowsMixedArguments = false;
this.sparkArgs = new ArrayList<>();
boolean isExample = false;
List<String> submitArgs = args; if (args.size() > 0) {
switch (args.get(0)) {
case PYSPARK_SHELL://对应"pyspark-shell-main"
this.allowsMixedArguments = true;
appResource = PYSPARK_SHELL;
submitArgs = args.subList(1, args.size());
break; case SPARKR_SHELL://对应"sparkr-shell-main"
this.allowsMixedArguments = true;
appResource = SPARKR_SHELL;
submitArgs = args.subList(1, args.size());
break; case RUN_EXAMPLE:
isExample = true;
submitArgs = args.subList(1, args.size());
} this.isExample = isExample;
OptionParser parser = new OptionParser();//为SparkSubmitCommandBuilder的内部类,并且继承了SparkSubmitOptionParser类
parser.parse(submitArgs);//parse方法为optionParser的父类方法,用于解析spark-submit的命令行参数
this.isAppResourceReq = parser.isAppResourceReq;
} else {
this.isExample = isExample;
this.isAppResourceReq = false;
}
}

SparkSubmitOptionParser 类

org.apache.spark.launcher.SparkSubmitOptionParser

package org.apache.spark.launcher;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern; /**
* Parser for spark-submit command line options.
* <p>
* This class encapsulates the parsing code for spark-submit command line options, so that there
* is a single list of options that needs to be maintained (well, sort of, but it makes it harder
* to break things).
*/
class SparkSubmitOptionParser { // The following constants define the "main" name for the available options. They're defined
// to avoid copy & paste of the raw strings where they're needed.
//
// The fields are not static so that they're exposed to Scala code that uses this class. See
// SparkSubmitArguments.scala. That is also why this class is not abstract - to allow code to
// easily use these constants without having to create dummy implementations of this class.
protected final String CLASS = "--class";
protected final String CONF = "--conf";
protected final String DEPLOY_MODE = "--deploy-mode";
protected final String DRIVER_CLASS_PATH = "--driver-class-path";
protected final String DRIVER_CORES = "--driver-cores";
protected final String DRIVER_JAVA_OPTIONS = "--driver-java-options";
protected final String DRIVER_LIBRARY_PATH = "--driver-library-path";
protected final String DRIVER_MEMORY = "--driver-memory";
protected final String EXECUTOR_MEMORY = "--executor-memory";
protected final String FILES = "--files";
protected final String JARS = "--jars";
protected final String KILL_SUBMISSION = "--kill";
protected final String MASTER = "--master";
protected final String NAME = "--name";
protected final String PACKAGES = "--packages";
protected final String PACKAGES_EXCLUDE = "--exclude-packages";
protected final String PROPERTIES_FILE = "--properties-file";
protected final String PROXY_USER = "--proxy-user";
protected final String PY_FILES = "--py-files";
protected final String REPOSITORIES = "--repositories";
protected final String STATUS = "--status";
protected final String TOTAL_EXECUTOR_CORES = "--total-executor-cores"; // Options that do not take arguments.
protected final String HELP = "--help";
protected final String SUPERVISE = "--supervise";
protected final String USAGE_ERROR = "--usage-error";
protected final String VERBOSE = "--verbose";
protected final String VERSION = "--version"; // Standalone-only options. // YARN-only options.
protected final String ARCHIVES = "--archives";
protected final String EXECUTOR_CORES = "--executor-cores";
protected final String KEYTAB = "--keytab";
protected final String NUM_EXECUTORS = "--num-executors";
protected final String PRINCIPAL = "--principal";
protected final String QUEUE = "--queue"; /**
* This is the canonical list of spark-submit options. Each entry in the array contains the
* different aliases for the same option; the first element of each entry is the "official"
* name of the option, passed to {@link #handle(String, String)}.
* <p>
* Options not listed here nor in the "switch" list below will result in a call to
* {@link #handleUnknown(String)}.
* <p>
* These two arrays are visible for tests.
*/
final String[][] opts = {
{ ARCHIVES },
{ CLASS },
{ CONF, "-c" },
{ DEPLOY_MODE },
{ DRIVER_CLASS_PATH },
{ DRIVER_CORES },
{ DRIVER_JAVA_OPTIONS },
{ DRIVER_LIBRARY_PATH },
{ DRIVER_MEMORY },
{ EXECUTOR_CORES },
{ EXECUTOR_MEMORY },
{ FILES },
{ JARS },
{ KEYTAB },
{ KILL_SUBMISSION },
{ MASTER },
{ NAME },
{ NUM_EXECUTORS },
{ PACKAGES },
{ PACKAGES_EXCLUDE },
{ PRINCIPAL },
{ PROPERTIES_FILE },
{ PROXY_USER },
{ PY_FILES },
{ QUEUE },
{ REPOSITORIES },
{ STATUS },
{ TOTAL_EXECUTOR_CORES },
}; /**
* List of switches (command line options that do not take parameters) recognized by spark-submit.
*/
final String[][] switches = {
{ HELP, "-h" },
{ SUPERVISE },
{ USAGE_ERROR },
{ VERBOSE, "-v" },
{ VERSION },
}; /**
* Parse a list of spark-submit command line options.
* <p>
* See SparkSubmitArguments.scala for a more formal description of available options.
*
* @throws IllegalArgumentException If an error is found during parsing.
*/
protected final void parse(List<String> args) {
Pattern eqSeparatedOpt = Pattern.compile("(--[^=]+)=(.+)"); int idx = 0;
for (idx = 0; idx < args.size(); idx++) {
String arg = args.get(idx);
String value = null; Matcher m = eqSeparatedOpt.matcher(arg);
if (m.matches()) {
arg = m.group(1);
value = m.group(2);
} // Look for options with a value.主要用于解析opts里定义的参数,比如--class --deploy-mode --num-executors等等之类的参数
String name = findCliOption(arg, opts);
if (name != null) {
if (value == null) {
if (idx == args.size() - 1) {
throw new IllegalArgumentException(
String.format("Missing argument for option '%s'.", arg));
}
idx++;
value = args.get(idx);
}
if (!handle(name, value)) { //handle方法调用的是OptionParser类重写之后的方法,将spark-submit放进来的参数对应值赋到spark对应的变量中
break;
}
continue;
} // Look for a switch.主要用于解析switches类的参数,比如--help --verbose等等这类的
name = findCliOption(arg, switches);
if (name != null) {
if (!handle(name, null)) {
break;
}
continue;
} if (!handleUnknown(arg)) {
break;
}
} if (idx < args.size()) {
idx++;
}
handleExtraArgs(args.subList(idx, args.size()));
} /**
* Callback for when an option with an argument is parsed.
*
* @param opt The long name of the cli option (might differ from actual command line).
* @param value The value. This will be <i>null</i> if the option does not take a value.
* @return Whether to continue parsing the argument list.
*/
protected boolean handle(String opt, String value) {
throw new UnsupportedOperationException();
} /**
* Callback for when an unrecognized option is parsed.
*
* @param opt Unrecognized option from the command line.
* @return Whether to continue parsing the argument list.
*/
protected boolean handleUnknown(String opt) {
throw new UnsupportedOperationException();
} /**
* Callback for remaining command line arguments after either {@link #handle(String, String)} or
* {@link #handleUnknown(String)} return "false". This will be called at the end of parsing even
* when there are no remaining arguments.
*
* @param extra List of remaining arguments.
*/
protected void handleExtraArgs(List<String> extra) {
throw new UnsupportedOperationException();
}
 
private String findCliOption(String name, String[][] available) {
for (String[] candidates : available) {
for (String candidate : candidates) {
if (candidate.equals(name)) {
return candidates[0];
}
}
}
return null;
} }

调用buildCommand方法:

org.apache.spark.launcher.SparkSubmitCommandBuilder#buildCommand

  public List<String> buildCommand(Map<String, String> env)
throws IOException, IllegalArgumentException {
if (PYSPARK_SHELL.equals(appResource) && isAppResourceReq) {
return buildPySparkShellCommand(env);
} else if (SPARKR_SHELL.equals(appResource) && isAppResourceReq) {
return buildSparkRCommand(env);
} else {
return buildSparkSubmitCommand(env);//主要在这里
}
}

org.apache.spark.launcher.SparkSubmitCommandBuilder#buildSparkSubmitCommand

  private List<String> buildSparkSubmitCommand(Map<String, String> env)
throws IOException, IllegalArgumentException {
// Load the properties file and check whether spark-submit will be running the app's driver
// or just launching a cluster app. When running the driver, the JVM's argument will be
// modified to cover the driver's configuration.
Map<String, String> config = getEffectiveConfig();
boolean isClientMode = isClientMode(config);
String extraClassPath = isClientMode ? config.get(SparkLauncher.DRIVER_EXTRA_CLASSPATH) : null; List<String> cmd = buildJavaCommand(extraClassPath);
// Take Thrift Server as daemon
if (isThriftServer(mainClass)) {
addOptionString(cmd, System.getenv("SPARK_DAEMON_JAVA_OPTS"));
}
addOptionString(cmd, System.getenv("SPARK_SUBMIT_OPTS")); // We don't want the client to specify Xmx. These have to be set by their corresponding
// memory flag --driver-memory or configuration entry spark.driver.memory
String driverExtraJavaOptions = config.get(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS);
if (!isEmpty(driverExtraJavaOptions) && driverExtraJavaOptions.contains("Xmx")) {
String msg = String.format("Not allowed to specify max heap(Xmx) memory settings through " +
"java options (was %s). Use the corresponding --driver-memory or " +
"spark.driver.memory configuration instead.", driverExtraJavaOptions);
throw new IllegalArgumentException(msg);
} if (isClientMode) {
// Figuring out where the memory value come from is a little tricky due to precedence.
// Precedence is observed in the following order:
// - explicit configuration (setConf()), which also covers --driver-memory cli argument.
// - properties file.
// - SPARK_DRIVER_MEMORY env variable
// - SPARK_MEM env variable
// - default value (1g)
// Take Thrift Server as daemon
String tsMemory =
isThriftServer(mainClass) ? System.getenv("SPARK_DAEMON_MEMORY") : null;
String memory = firstNonEmpty(tsMemory, config.get(SparkLauncher.DRIVER_MEMORY),
System.getenv("SPARK_DRIVER_MEMORY"), System.getenv("SPARK_MEM"), DEFAULT_MEM);
cmd.add("-Xmx" + memory);
addOptionString(cmd, driverExtraJavaOptions);
mergeEnvPathList(env, getLibPathEnvName(),
config.get(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH));
} cmd.add("org.apache.spark.deploy.SparkSubmit");
cmd.addAll(buildSparkSubmitArgs());
return cmd;
}

脚本中shell相关内容:

1.set -o posix

set命令是shell解释器的一个内置命令,用来设置shell解释器的属性,从而能够控制shell解释器的一些行为。

在set命令中,选项前面跟着 - 号表示开启这个选项, + 表示关闭这个选项。

POSIX,Portable Operating System Interface。
是UNIX系统的一个设计标准,很多类UNIX系统也在支持兼容这个标准,如Linux。
遵循这个标准的好处是软件可以跨平台。
所以windows也支持就很容易理解了,那么多优秀的开源软件,支持了这个这些软件就可能有windows版本,就可以完善丰富windows下的软件。
set -o posix:开启bash的posix模式。

2.command -v java

command [-pVv] command [arg ...]

用command指定可取消正常的shell function寻找。只有内建命令及在PATH中找得到的才会被执行。

"-p"选项,搜寻命令的方式是用PATH来找。"-V"或"-v"选项,会显示出该命令的一些简约描述。

4.read -d

-d :表示delimiter,即定界符,一般情况下是以IFS为参数的间隔,但是通过-d,我们可以定义一直读到出现执行的字符位置。例如read –d madfds value,读到有m的字符的时候就不在继续向后读,例如输入为 hello m,有效值为“hello”,请注意m前面的空格等会被删除。这种方式可以输入多个字符串,例如定义“.”作为结符号等等

read命令 -n(不换行) -p(提示语句) -n(字符个数) -t(等待时间) -s(不回显)

spark-submit脚本分析的更多相关文章

  1. Spark 个人实战系列(2)--Spark 服务脚本分析

    前言: spark最近非常的火热, 本文不讲spark原理, 而是研究spark集群搭建和服务的脚本是如何编写的, 管中窥豹, 希望从运行脚本的角度去理解spark集群. 研究的spark为1.0.1 ...

  2. Spark Submit 脚本

    当我们需要命令行传递参数时候,将--class 写在前面,然后是jar 最后是参数 spark-submit --master yarn --num-executors 3 --executor-me ...

  3. Spark配置&启动脚本分析

    本文档基于Spark2.0,对spark启动脚本进行分析. date:2016/8/3 author:wangxl Spark配置&启动脚本分析 我们主要关注3类文件,配置文件,启动脚本文件以 ...

  4. Spark SQL源代码分析之核心流程

    /** Spark SQL源代码分析系列文章*/ 自从去年Spark Submit 2013 Michael Armbrust分享了他的Catalyst,到至今1年多了,Spark SQL的贡献者从几 ...

  5. Spark源码分析之Spark Shell(下)

    继上次的Spark-shell脚本源码分析,还剩下后面半段.由于上次涉及了不少shell的基本内容,因此就把trap和stty放在这篇来讲述. 上篇回顾:Spark源码分析之Spark Shell(上 ...

  6. Spark源码分析之Spark-submit和Spark-class

    有了前面spark-shell的经验,看这两个脚本就容易多啦.前面总结的Spark-shell的分析可以参考: Spark源码分析之Spark Shell(上) Spark源码分析之Spark She ...

  7. 【原创】大数据基础之Spark(1)Spark Submit即Spark任务提交过程

    Spark2.1.1 一 Spark Submit本地解析 1.1 现象 提交命令: spark-submit --master local[10] --driver-memory 30g --cla ...

  8. Spark源码分析:多种部署方式之间的区别与联系(转)

    原文链接:Spark源码分析:多种部署方式之间的区别与联系(1) 从官方的文档我们可以知道,Spark的部署方式有很多种:local.Standalone.Mesos.YARN.....不同部署方式的 ...

  9. Spark源码分析 -- TaskScheduler

    Spark在设计上将DAGScheduler和TaskScheduler完全解耦合, 所以在资源管理和task调度上可以有更多的方案 现在支持, LocalSheduler, ClusterSched ...

  10. Spark源码分析 – SchedulerBackend

    SchedulerBackend, 两个任务, 申请资源和task执行和管理 对于SparkDeploySchedulerBackend, 基于actor模式, 主要就是启动和管理两个actor De ...

随机推荐

  1. 如何:使用 as 和 is 运算符安全地进行强制转换(C# 编程指南)

    如何:使用 as 和 is 运算符安全地进行强制转换(C# 编程指南) 由于对象是多态的,因此基类类型的变量可以保存派生类型.若要访问派生类型的方法,需要将值强制转换回该派生类型.不过,在这些情况下, ...

  2. python基础面试题1

    Python面试重点(基础篇) 注意:只有必答题部分计算分值,补充题不计算分值. 第一部分 必答题(每题2分) 简述列举了解的编程语言及语言间的区别? c语言是编译型语言,运行速度快,但翻译时间长py ...

  3. css 瀑布流

    瀑布流 <!DOCTYPE html> <html> <head> <meta charset="utf-8" /> <tit ...

  4. canvas的其他应用

    画布的基础知识 专门研究画布的大佬 手动实现echar的大佬 echar官方 画布之水印 ctx.font = "bold 20px Arial"; ctx.lineWidth = ...

  5. k-近邻算法的优缺点及拓展思考

    //2019.08.03晚#k-近邻算法的拓展思考与总结1.k-近邻算法是一种非常典型的分类监督学习算法,它可以解决多分类的问题:另外,它的整体思想简单,效果强大.它也可以用来解决回归问题,使用的库函 ...

  6. Jquery实现横向tab切换

    //需求:鼠标放在不同的导航栏上,下面显示的内容自动切换 //代码如下 <!DOCTYPE html> <html lang="en"> <head& ...

  7. Spring Boot 核心注解与配置文件

    @SpringBootApplication注解 Spring Boot项目有一个入口类 (*Application) 在这个类中有一个main 方法,是运行该项目的切入点.而@SpringBootA ...

  8. 查看 Secret【转】

    可以通过 kubectl get secret 查看存在的 secret. 显示有两个数据条目,kubectl describe secret 查看条目的 Key: 如果还想查看 Value,可以用  ...

  9. VUE - 路由跳转时设置动画效果

    /* 为对应的路由跳转时设置动画效果 */   <transition name="fade">         <router-view />     & ...

  10. python 聚类分析 k均值算法

    dataSet = [ #数据集 # 1 [0.697, 0.460], # 2 [0.774, 0.376], # 3 [0.634, 0.264], # 4 [0.608, 0.318], # 5 ...