spark 源码分析之二 -- SparkContext 的初始化过程

创建或使用现有Session

从Spark 2.0 开始，引入了 SparkSession的概念，创建或使用已有的session 代码如下：

 // Obtain a SparkSession through the builder API: set the application name to "SparkTC",
 // then call getOrCreate() to get the resulting session.
 val spark = SparkSession

   .builder

   .appName("SparkTC")

   .getOrCreate()

首先，使用了 builder 模式来创建或使用已存在的SparkSession，org.apache.spark.sql.SparkSession.Builder#getOrCreate 代码如下：

 /**
  * Returns a usable existing SparkSession, or creates a new one.
  *
  * Lookup order: the current thread's active session, then the default session. An existing
  * session is reused only if its SparkContext has not been stopped; in that case the builder
  * options are applied to the session's conf and a warning is logged when options are non-empty.
  * Otherwise a SparkContext is taken from `userSuppliedContext` or obtained through
  * `SparkContext.getOrCreate`, session extensions are applied if configured, and the new
  * session is registered as both the default and the active session.
  */
 def getOrCreate(): SparkSession = synchronized {

   assertOnDriver() // Note: a SparkSession may only be created and accessed on the driver

   // Get the session from current thread's active session.

   // activeThreadSession is an InheritableThreadLocal (a subclass of ThreadLocal) according to
   // the article author; since the value is held per thread, reading it needs no extra lock.

   var session = activeThreadSession.get()

   // If a session exists and its SparkContext has NOT been stopped, reuse the existing session.

   if ((session ne null) && !session.sparkContext.isStopped) {

     options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) }

     if (options.nonEmpty) {

       logWarning("Using an existing SparkSession; some configuration may not take effect.")

     }

     return session

   }

   // Lock on the SparkSession object to prevent the session from being initialized twice.

 SparkSession.synchronized {

     // If the current thread does not have an active session, get it from the global session.

     // If the default session exists and its SparkContext has NOT been stopped, reuse it as well.

     session = defaultSession.get()

     if ((session ne null) && !session.sparkContext.isStopped) {

       options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) }

       if (options.nonEmpty) {

         logWarning("Using an existing SparkSession; some configuration may not take effect.")

       }

       return session

     }

     // Create the session.

     val sparkContext = userSuppliedContext.getOrElse { // by default no SparkContext is supplied (author's note), so one is obtained here

       val sparkConf = new SparkConf()

       options.foreach { case (k, v) => sparkConf.set(k, v) }

       // set a random app name if not given.

       if (!sparkConf.contains("spark.app.name")) {

         sparkConf.setAppName(java.util.UUID.randomUUID().toString)

       }

       SparkContext.getOrCreate(sparkConf)

       // Do not update `SparkConf` for existing `SparkContext`, as it's shared by all sessions.

     }

     // Initialize extensions if the user has defined a configurator class.

     val extensionConfOption = sparkContext.conf.get(StaticSQLConf.SPARK_SESSION_EXTENSIONS)

     if (extensionConfOption.isDefined) {

       val extensionConfClassName = extensionConfOption.get

       try {

         val extensionConfClass = Utils.classForName(extensionConfClassName)

         val extensionConf = extensionConfClass.newInstance()

           .asInstanceOf[SparkSessionExtensions => Unit]

         extensionConf(extensions)

       } catch {

         // Ignore the error if we cannot find the class or when the class has the wrong type.

         case e @ (_: ClassCastException |

                   _: ClassNotFoundException |

                   _: NoClassDefFoundError) =>

           logWarning(s"Cannot use $extensionConfClassName to configure session extensions.", e)

       }

     }

     // Construct the SparkSession, handing it the SparkContext obtained above.

     session = new SparkSession(sparkContext, None, None, extensions)

     options.foreach { case (k, v) => session.initialSessionOptions.put(k, v) }

     // Register it as the default session.

     setDefaultSession(session)

     // Register it as the active session.

 setActiveSession(session)

     // Register a successfully instantiated context to the singleton. This should be at the

     // end of the class definition so that the singleton is updated only if there is no

     // exception in the construction of the instance.

     // Add a SparkListener that resets the default session when the application ends.

 sparkContext.addSparkListener(new SparkListener {

       override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {

         defaultSession.set(null)

       }

     })

   }

   return session

 }

org.apache.spark.SparkContext#getOrCreate方法如下：

 /**
  * Returns the active SparkContext, creating one from `config` if none is active yet.
  *
  * When a context already exists and `config` contains settings, a warning is logged because
  * the existing context is returned as-is.
  */
 def getOrCreate(config: SparkConf): SparkContext = {

   // Synchronize to ensure that multiple create requests don't trigger an exception

   // from assertNoOtherContextIsRunning within setActiveContext

   // The lock is the SPARK_CONTEXT_CONSTRUCTOR_LOCK object.

   SPARK_CONTEXT_CONSTRUCTOR_LOCK.synchronized {

     // activeContext is an AtomicReference (declared outside this excerpt, per the author's
     // note), so its set/update operations are atomic.

     if (activeContext.get() == null) {

       // No active context yet: create one and register it; multiple contexts are not allowed
       // (allowMultipleContexts = false).

       setActiveContext(new SparkContext(config), allowMultipleContexts = false)

     } else {

       if (config.getAll.nonEmpty) {

         logWarning("Using an existing SparkContext; some configuration may not take effect.")

       }

     }

     activeContext.get()

   }

 }

Spark Context 初始化

SparkContext 代表到 spark 集群的连接，它可以用来在spark集群上创建 RDD，accumulator和broadcast 变量。一个JVM 只能有一个活动的 SparkContext 对象，在创建新的 SparkContext 之前，必须先调用 stop 方法停止当前活动的 SparkContext。
调用构造方法时，会先初始化类的成员变量，然后进入初始化过程。初始化逻辑由一个 try catch 块包围，这个 try catch 块位于类体中，是在执行构造函数时执行的，可参照我写的一篇文章：scala class中孤立代码块揭秘

这块孤立的代码块如下：　　

 // Initialization sequence of SparkContext (a free-standing block in the class body, so it runs
 // as part of the constructor). The steps are numbered in execution order. If any step throws a
 // non-fatal error, the error is logged, stop() is attempted, and the original error is rethrown.
 try {

   // 1. Initialize the configuration

   _conf = config.clone()

   _conf.validateSettings()

   if (!_conf.contains("spark.master")) {

     throw new SparkException("A master URL must be set in your configuration")

   }

   if (!_conf.contains("spark.app.name")) {

     throw new SparkException("An application name must be set in your configuration")

   }

   // log out spark.app.name in the Spark driver logs

   logInfo(s"Submitted application: $appName")

   // System property spark.yarn.app.id must be set if user code ran by AM on a YARN cluster

   if (master == "yarn" && deployMode == "cluster" && !_conf.contains("spark.yarn.app.id")) {

     throw new SparkException("Detected yarn cluster mode, but isn't running on a cluster. " +

       "Deployment to YARN is not supported directly by SparkContext. Please use spark-submit.")

   }

   if (_conf.getBoolean("spark.logConf", false)) {

     logInfo("Spark configuration:\n" + _conf.toDebugString)

   }

   // Set Spark driver host and port system properties. This explicitly sets the configuration

   // instead of relying on the default value of the config constant.

   _conf.set(DRIVER_HOST_ADDRESS, _conf.get(DRIVER_HOST_ADDRESS))

   _conf.setIfMissing("spark.driver.port", "0")

   _conf.set("spark.executor.id", SparkContext.DRIVER_IDENTIFIER)

   _jars = Utils.getUserJars(_conf)

   _files = _conf.getOption("spark.files").map(_.split(",")).map(_.filter(_.nonEmpty))

     .toSeq.flatten

   // 2. Resolve the event log directory and the event log compression codec

   _eventLogDir =

     if (isEventLogEnabled) {

       val unresolvedDir = conf.get("spark.eventLog.dir", EventLoggingListener.DEFAULT_LOG_DIR)

         .stripSuffix("/")

       Some(Utils.resolveURI(unresolvedDir))

     } else {

       None

     }

   _eventLogCodec = {

     val compress = _conf.getBoolean("spark.eventLog.compress", false)

     if (compress && isEventLogEnabled) {

       Some(CompressionCodec.getCodecName(_conf)).map(CompressionCodec.getShortName)

     } else {

       None

     }

   }

   // 3. LiveListenerBus asynchronously delivers SparkListenerEvents to the registered
   //    SparkListeners (per the author's note).

   _listenerBus = new LiveListenerBus(_conf)

   // Initialize the app status store and listener before SparkEnv is created so that it gets

   // all events.

   // 4. Create the (in-memory, per the author's note) key-value status store for the app

   _statusStore = AppStatusStore.createLiveStore(conf)

   // 5. Register the status store's listener on the LiveListenerBus

   listenerBus.addToStatusQueue(_statusStore.listener.get)

   // Create the Spark execution environment (cache, map output tracker, etc)

   // 6. Create the driver-side env

   // Per the author's note, SparkEnv holds all runtime objects of a running Spark instance
   // (master or worker), including the serializer, RpcEnv, block manager, map output tracker, etc.

   // Spark code currently finds the SparkEnv through a global variable, so all threads can
   // access the same SparkEnv;

   // once set here, it can be accessed through SparkEnv.get.

   _env = createSparkEnv(_conf, isLocal, listenerBus)

   SparkEnv.set(_env)

   // If running the REPL, register the repl's output dir with the file server.

   _conf.getOption("spark.repl.class.outputDir").foreach { path =>

     val replUri = _env.rpcEnv.fileServer.addDirectory("/classes", new File(path))

     _conf.set("spark.repl.class.uri", replUri)

   }

   // 7. Low-level API for monitoring and reporting the status of Spark jobs and stages

   _statusTracker = new SparkStatusTracker(this, _statusStore)

   // 8. Console progress bar (only when enabled and INFO logging is off)

   _progressBar =

     if (_conf.get(UI_SHOW_CONSOLE_PROGRESS) && !log.isInfoEnabled) {

       Some(new ConsoleProgressBar(this))

     } else {

       None

     }

   // 9. Spark UI (implemented on Jetty, per the author's note; not visible in this excerpt)

   _ui =

     if (conf.getBoolean("spark.ui.enabled", true)) {

       Some(SparkUI.create(Some(this), _statusStore, _conf, _env.securityManager, appName, "",

         startTime))

     } else {

       // For tests, do not enable the UI

       None

     }

   // Bind the UI before starting the task scheduler to communicate

   // the bound port to the cluster manager properly

   _ui.foreach(_.bind())

   // 10. Create the Hadoop configuration

   _hadoopConfiguration = SparkHadoopUtil.get.newConfiguration(_conf)

   // 11. Add each JAR given through the constructor

   if (jars != null) {

     jars.foreach(addJar)

   }

   if (files != null) {

     files.foreach(addFile)

   }

   // 12. Compute the executor memory (in MB): spark.executor.memory, then the
   //     SPARK_EXECUTOR_MEMORY / SPARK_MEM environment variables, defaulting to 1024

   _executorMemory = _conf.getOption("spark.executor.memory")

     .orElse(Option(System.getenv("SPARK_EXECUTOR_MEMORY")))

     .orElse(Option(System.getenv("SPARK_MEM"))

     .map(warnSparkMem))

     .map(Utils.memoryStringToMb)

     .getOrElse(1024)

   // Convert java options to env vars as a work around

   // since we can't set env vars directly in sbt.

   for { (envKey, propKey) <- Seq(("SPARK_TESTING", "spark.testing"))

     value <- Option(System.getenv(envKey)).orElse(Option(System.getProperty(propKey)))} {

     executorEnvs(envKey) = value

   }

   Option(System.getenv("SPARK_PREPEND_CLASSES")).foreach { v =>

     executorEnvs("SPARK_PREPEND_CLASSES") = v

   }

   // The Mesos scheduler backend relies on this environment variable to set executor memory.

   // TODO: Set this only in the Mesos scheduler.

   executorEnvs("SPARK_EXECUTOR_MEMORY") = executorMemory + "m"

   executorEnvs ++= _conf.getExecutorEnv

   executorEnvs("SPARK_USER") = sparkUser

   // We need to register "HeartbeatReceiver" before "createTaskScheduler" because Executor will

   // retrieve "HeartbeatReceiver" in the constructor. (SPARK-6640)

   // 13. Create the HeartbeatReceiver endpoint

   _heartbeatReceiver = env.rpcEnv.setupEndpoint(

     HeartbeatReceiver.ENDPOINT_NAME, new HeartbeatReceiver(this))

   // Create and start the scheduler

   // 14. Create the task scheduler and the scheduler backend

   val (sched, ts) = SparkContext.createTaskScheduler(this, master, deployMode)

   _schedulerBackend = sched

   _taskScheduler = ts

   // 15. Create the DAGScheduler instance

   _dagScheduler = new DAGScheduler(this)

   _heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet)

   // start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's

   // constructor

   // 16. Start the task scheduler

   _taskScheduler.start()

   // 17. Get the application ID from the task scheduler

   _applicationId = _taskScheduler.applicationId()

   // 18. Get the application attempt ID from the task scheduler

   _applicationAttemptId = taskScheduler.applicationAttemptId()

   _conf.set("spark.app.id", _applicationId)

   if (_conf.getBoolean("spark.ui.reverseProxy", false)) {

     System.setProperty("spark.ui.proxyBase", "/proxy/" + _applicationId)

   }

   // 19. Set the application ID on the UI

   _ui.foreach(_.setAppId(_applicationId))

   // 20. Initialize the block manager

   _env.blockManager.initialize(_applicationId)

   // The metrics system for Driver need to be set spark.app.id to app ID.

   // So it should start after we get app ID from the task scheduler and set spark.app.id.

   // 21. Start the metrics system

   _env.metricsSystem.start()

   // Attach the driver metrics servlet handler to the web ui after the metrics system is started.

   // 22. Attach the metrics system's servlet handlers to the UI

   _env.metricsSystem.getServletHandlers.foreach(handler => ui.foreach(_.attachHandler(handler)))

   // 23. Initialize the event logging listener (only when event logging is enabled)

   _eventLogger =

     if (isEventLogEnabled) {

       val logger =

         new EventLoggingListener(_applicationId, _applicationAttemptId, _eventLogDir.get,

           _conf, _hadoopConfiguration)

       logger.start()

       listenerBus.addToEventLogQueue(logger)

       Some(logger)

     } else {

       None

     }

   // Optionally scale number of executors dynamically based on workload. Exposed for testing.

   // 24. If dynamic executor allocation is enabled, instantiate the ExecutorAllocationManager
   //     and start it

   val dynamicAllocationEnabled = Utils.isDynamicAllocationEnabled(_conf)

   _executorAllocationManager =

     if (dynamicAllocationEnabled) {

       schedulerBackend match {

         case b: ExecutorAllocationClient =>

           Some(new ExecutorAllocationManager(

             schedulerBackend.asInstanceOf[ExecutorAllocationClient], listenerBus, _conf,

             _env.blockManager.master))

         case _ =>

           None

       }

     } else {

       None

     }

   _executorAllocationManager.foreach(_.start())

   // 25. Initialize the ContextCleaner and start it

   _cleaner =

     if (_conf.getBoolean("spark.cleaner.referenceTracking", true)) {

       Some(new ContextCleaner(this))

     } else {

       None

     }

   _cleaner.foreach(_.start())

   // 26. Set up and start the listener bus

   setupAndStartListenerBus()

   // 27. The task scheduler is ready; post the environment-update event

   postEnvironmentUpdate()

   // 28. Post the application-start event

   postApplicationStart()

   // Post init

   // 29. Run the task scheduler's post-start hook (per the author's note, this waits until
   //     the scheduler backend is ready)

   _taskScheduler.postStartHook()

   // 30. Register the DAGScheduler metrics source

   _env.metricsSystem.registerSource(_dagScheduler.metricsSource)

   // 31. Register the BlockManager metrics source

   _env.metricsSystem.registerSource(new BlockManagerSource(_env.blockManager))

   // 32. Register the ExecutorAllocationManager metrics source, if a manager was created

   _executorAllocationManager.foreach { e =>

     _env.metricsSystem.registerSource(e.executorAllocationManagerSource)

   }

   // Make sure the context is stopped if the user forgets about it. This avoids leaving

   // unfinished event logs around after the JVM exits cleanly. It doesn't help if the JVM

   // is killed, though.

   logDebug("Adding shutdown hook") // force eager creation of logger

   // 33. Install a shutdown hook that calls stop() on this context; failures inside the hook
   //     are logged and ignored

   _shutdownHookRef = ShutdownHookManager.addShutdownHook(

     ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () =>

     logInfo("Invoking stop() from shutdown hook")

     try {

       stop()

     } catch {

       case e: Throwable =>

         logWarning("Ignoring Exception while stopping SparkContext from shutdown hook", e)

     }

   }

 } catch {

   // Initialization failed: log, try to release whatever was already started, then rethrow
   // the original error (the finally clause guarantees the rethrow even if stop() fails).

   case NonFatal(e) =>

     logError("Error initializing SparkContext.", e)

     try {

       stop()

     } catch {

       case NonFatal(inner) =>

         logError("Error stopping SparkContext after init error.", inner)

     } finally {

       throw e

     }

 }

从上面可以看出，spark context 的初始化是非常复杂的，涉及的spark 组件很多，包括异步事件总线系统LiveListenerBus、SparkEnv、SparkUI、DAGScheduler、metrics监测系统、EventLoggingListener、TaskScheduler、ExecutorAllocationManager、ContextCleaner等等。本文暂且当作总述，后面对部分组件会有比较全面的剖析。