Spark2.x(六十):在Structured Streaming流处理中是如何查找kafka的DataSourceProvider?
本章节根据源代码分析Spark Structured Streaming(Spark2.4)在进行DataSourceProvider查找的流程,首先,我们看下读取流数据源kafka的代码:
- SparkSession sparkSession = SparkSession.builder().getOrCreate(); // reuse the active session or create one
- Dataset<Row> sourceDataset = sparkSession.readStream().format("kafka").option("xxx", "xxx").load(); // "kafka" is the format name handed to format(...); the "xxx" option key/value are placeholders
- @InterfaceStability.Evolving
- final class DataStreamReader private[sql](sparkSession: SparkSession) extends Logging {
- /**
- * Specifies the input data source format.
- *
- * Stores the given name in this reader's `source` field and returns the same reader, so the
- * call can be chained with further builder calls.
- *
- * @param source name of the input data source format, e.g. "kafka"
- * @return this reader
- * @since 2.0.0
- */
- def format(source: String): DataStreamReader = {
- this.source = source
- this
- }
- 。。。
- }
- /**
- * Loads input data stream in as a `DataFrame`, for data streams that don't require a path
- * (e.g. external key-value stores).
- *
- * Resolves `source` to a provider class through `DataSource.lookupDataSource`, instantiates it,
- * and then dispatches on the interfaces the instance implements: `MicroBatchReadSupport` is
- * checked first, then `ContinuousReadSupport`; anything else takes the data source v1 path.
- *
- * @return a `DataFrame` created by `Dataset.ofRows` over a `StreamingRelationV2` (v2 branches)
- *         or a `StreamingRelation` (v1 branch)
- * @throws AnalysisException if the lower-cased `source` equals `DDLUtils.HIVE_PROVIDER`
- * @since 2.0.0
- */
- def load(): DataFrame = {
- if (source.toLowerCase(Locale.ROOT) == DDLUtils.HIVE_PROVIDER) {
- throw new AnalysisException("Hive data source can only be used with tables, you can not " +
- "read files of Hive data source directly.")
- }
- val ds = DataSource.lookupDataSource(source, sparkSession.sqlContext.conf).newInstance() // resolve `source` to a provider class and instantiate it
- // We need to generate the V1 data source so we can pass it to the V2 relation as a shim.
- // We can't be sure at this point whether we'll actually want to use V2, since we don't know the
- // writer or whether the query is continuous.
- val v1DataSource = DataSource(
- sparkSession,
- userSpecifiedSchema = userSpecifiedSchema,
- className = source,
- options = extraOptions.toMap)
- val v1Relation = ds match { // Some(...) only when the provider is a StreamSourceProvider
- case _: StreamSourceProvider => Some(StreamingRelation(v1DataSource))
- case _ => None
- }
- ds match {
- case s: MicroBatchReadSupport =>
- val sessionOptions = DataSourceV2Utils.extractSessionConfigs(
- ds = s, conf = sparkSession.sessionState.conf)
- val options = sessionOptions ++ extraOptions
- val dataSourceOptions = new DataSourceOptions(options.asJava)
- var tempReader: MicroBatchReader = null // stays null if createMicroBatchReader throws, so the finally block can skip stop()
- val schema = try {
- tempReader = s.createMicroBatchReader(
- Optional.ofNullable(userSpecifiedSchema.orNull),
- Utils.createTempDir(namePrefix = s"temporaryReader").getCanonicalPath,
- dataSourceOptions)
- tempReader.readSchema()
- } finally {
- // Stop tempReader to avoid side-effect thing
- if (tempReader != null) {
- tempReader.stop()
- tempReader = null
- }
- }
- Dataset.ofRows(
- sparkSession,
- StreamingRelationV2(
- s, source, options,
- schema.toAttributes, v1Relation)(sparkSession))
- case s: ContinuousReadSupport =>
- val sessionOptions = DataSourceV2Utils.extractSessionConfigs(
- ds = s, conf = sparkSession.sessionState.conf)
- val options = sessionOptions ++ extraOptions
- val dataSourceOptions = new DataSourceOptions(options.asJava)
- val tempReader = s.createContinuousReader( // NOTE(review): unlike the micro-batch branch, this reader is not stopped in this method
- Optional.ofNullable(userSpecifiedSchema.orNull),
- Utils.createTempDir(namePrefix = s"temporaryReader").getCanonicalPath,
- dataSourceOptions)
- Dataset.ofRows(
- sparkSession,
- StreamingRelationV2(
- s, source, options,
- tempReader.readSchema().toAttributes, v1Relation)(sparkSession))
- case _ =>
- // Code path for data source v1.
- Dataset.ofRows(sparkSession, StreamingRelation(v1DataSource))
- }
- }
load()中的 val ds = DataSource.lookupDataSource(source, ...).newInstance() 用到了format(...)方法里赋值的source变量(本例中为"kafka")。要想知道ds是什么(Dataset还是其他),需要查看DataSource.lookupDataSource(source, ...)方法的实现。
DataSource.lookupDataSource(source, sparkSession.sqlContext.conf)解析
- object DataSource extends Logging {
- 。。。。。/**
- * Class that were removed in Spark 2.0. Used to detect incompatibility libraries for Spark 2.0.
- */
- private val spark2RemovedClasses = Set(
- "org.apache.spark.sql.DataFrame",
- "org.apache.spark.sql.sources.HadoopFsRelationProvider",
- "org.apache.spark.Logging")
- /**
- * Given a provider name, look up the data source class definition.
- *
- * Resolution order:
- *  1. `provider` is translated through `backwardCompatibilityMap` (unchanged on a miss), then
- *     the ORC and Databricks-Avro special cases are applied, giving `provider1`.
- *  2. Every `DataSourceRegister` discovered by `ServiceLoader` is compared with `provider1` via
- *     `shortName()` (case-insensitive). Exactly one match wins; with several matches the single
- *     one whose class name starts with "org.apache.spark" wins.
- *  3. If no alias matches, `provider1` and then `provider1.DefaultSource` are tried as class
- *     names on the context (or Spark) class loader.
- *
- * @param provider alias or fully qualified class name of the data source
- * @param conf     SQL configuration consulted for the ORC implementation and the Avro mapping
- * @return the class of the resolved data source
- * @throws AnalysisException      if the Hive ORC, Avro or Kafka source cannot be loaded, or if
- *                                several aliases match and not exactly one of them is internal
- * @throws ClassNotFoundException if any other provider cannot be loaded, or a class removed in
- *                                Spark 2.0 is referenced by the data source
- */
- def lookupDataSource(provider: String, conf: SQLConf): Class[_] = {
- val provider1 = backwardCompatibilityMap.getOrElse(provider, provider) match {
- case name if name.equalsIgnoreCase("orc") &&
- conf.getConf(SQLConf.ORC_IMPLEMENTATION) == "native" =>
- classOf[OrcFileFormat].getCanonicalName
- case name if name.equalsIgnoreCase("orc") &&
- conf.getConf(SQLConf.ORC_IMPLEMENTATION) == "hive" =>
- "org.apache.spark.sql.hive.orc.OrcFileFormat"
- case "com.databricks.spark.avro" if conf.replaceDatabricksSparkAvroEnabled =>
- "org.apache.spark.sql.avro.AvroFileFormat"
- case name => name
- }
- val provider2 = s"$provider1.DefaultSource"
- val loader = Utils.getContextOrSparkClassLoader
- val serviceLoader = ServiceLoader.load(classOf[DataSourceRegister], loader)
- try {
- serviceLoader.asScala.filter(_.shortName().equalsIgnoreCase(provider1)).toList match {
- // the provider format did not match any given registered aliases
- case Nil =>
- try {
- Try(loader.loadClass(provider1)).orElse(Try(loader.loadClass(provider2))) match {
- case Success(dataSource) =>
- // Found the data source using fully qualified path
- dataSource
- case Failure(error) =>
- if (provider1.startsWith("org.apache.spark.sql.hive.orc")) {
- throw new AnalysisException(
- "Hive built-in ORC data source must be used with Hive support enabled. " +
- "Please use the native ORC data source by setting 'spark.sql.orc.impl' to " +
- "'native'")
- } else if (provider1.toLowerCase(Locale.ROOT) == "avro" ||
- provider1 == "com.databricks.spark.avro" ||
- provider1 == "org.apache.spark.sql.avro") {
- throw new AnalysisException(
- s"Failed to find data source: $provider1. Avro is built-in but external data " +
- "source module since Spark 2.4. Please deploy the application as per " +
- "the deployment section of \"Apache Avro Data Source Guide\".")
- } else if (provider1.toLowerCase(Locale.ROOT) == "kafka") {
- throw new AnalysisException(
- s"Failed to find data source: $provider1. Please deploy the application as " +
- "per the deployment section of " +
- "\"Structured Streaming + Kafka Integration Guide\".")
- } else {
- // The URL had been stripped from the listing, leaving the message ending in "packages at ".
- throw new ClassNotFoundException(
- s"Failed to find data source: $provider1. Please find packages at " +
- "http://spark.apache.org/third-party-projects.html",
- error)
- }
- }
- } catch {
- case e: NoClassDefFoundError => // This one won't be caught by Scala NonFatal
- // NoClassDefFoundError's class name uses "/" rather than "." for packages
- val className = e.getMessage.replaceAll("/", ".")
- if (spark2RemovedClasses.contains(className)) {
- throw new ClassNotFoundException(s"$className was removed in Spark 2.0. " +
- "Please check if your library is compatible with Spark 2.0", e)
- } else {
- throw e
- }
- }
- case head :: Nil =>
- // there is exactly one registered alias
- head.getClass
- case sources =>
- // There are multiple registered aliases for the input. If there is single datasource
- // that has "org.apache.spark" package in the prefix, we use it considering it is an
- // internal datasource within Spark.
- // Class names of every matching source; used only in the warning / error text below.
- val sourceNames = sources.map(_.getClass.getName)
- val internalSources = sources.filter(_.getClass.getName.startsWith("org.apache.spark"))
- if (internalSources.size == 1) {
- logWarning(s"Multiple sources found for $provider1 (${sourceNames.mkString(", ")}), " +
- s"defaulting to the internal datasource (${internalSources.head.getClass.getName}).")
- internalSources.head.getClass
- } else {
- throw new AnalysisException(s"Multiple sources found for $provider1 " +
- s"(${sourceNames.mkString(", ")}), please specify the fully qualified class name.")
- }
- }
- } catch {
- case e: ServiceConfigurationError if e.getCause.isInstanceOf[NoClassDefFoundError] =>
- // NoClassDefFoundError's class name uses "/" rather than "." for packages
- val className = e.getCause.getMessage.replaceAll("/", ".")
- if (spark2RemovedClasses.contains(className)) {
- throw new ClassNotFoundException(s"Detected an incompatible DataSourceRegister. " +
- "Please remove the incompatible library from classpath or upgrade it. " +
- s"Error: ${e.getMessage}", e)
- } else {
- throw e
- }
- }
- }
- 、、、
- }
1)优先从object DataSource预定义的backwardCompatibilityMap中查找provider;若未命中(例如"kafka"不在该映射中),则沿用传入的provider名称,即getOrElse(provider, provider):
- object DataSource extends Logging {
- /**
- * A map to maintain backward compatibility in case we move data sources around.
- *
- * Keys are provider names that used to be valid (old packages or `DefaultSource` classes);
- * values are the class names they are translated to before any further lookup. Names that
- * have no entry here, such as "kafka", are not translated by this map.
- */
- private val backwardCompatibilityMap: Map[String, String] = {
- val jdbc = classOf[JdbcRelationProvider].getCanonicalName
- val json = classOf[JsonFileFormat].getCanonicalName
- val parquet = classOf[ParquetFileFormat].getCanonicalName
- val csv = classOf[CSVFileFormat].getCanonicalName
- // Referenced by name rather than classOf; the LibSVM names had been stripped from the listing.
- val libsvm = "org.apache.spark.ml.source.libsvm.LibSVMFileFormat"
- val orc = "org.apache.spark.sql.hive.orc.OrcFileFormat"
- val nativeOrc = classOf[OrcFileFormat].getCanonicalName
- val socket = classOf[TextSocketSourceProvider].getCanonicalName
- val rate = classOf[RateStreamProvider].getCanonicalName
- Map(
- "org.apache.spark.sql.jdbc" -> jdbc,
- "org.apache.spark.sql.jdbc.DefaultSource" -> jdbc,
- "org.apache.spark.sql.execution.datasources.jdbc.DefaultSource" -> jdbc,
- "org.apache.spark.sql.execution.datasources.jdbc" -> jdbc,
- "org.apache.spark.sql.json" -> json,
- "org.apache.spark.sql.json.DefaultSource" -> json,
- "org.apache.spark.sql.execution.datasources.json" -> json,
- "org.apache.spark.sql.execution.datasources.json.DefaultSource" -> json,
- "org.apache.spark.sql.parquet" -> parquet,
- "org.apache.spark.sql.parquet.DefaultSource" -> parquet,
- "org.apache.spark.sql.execution.datasources.parquet" -> parquet,
- "org.apache.spark.sql.execution.datasources.parquet.DefaultSource" -> parquet,
- "org.apache.spark.sql.hive.orc.DefaultSource" -> orc,
- "org.apache.spark.sql.hive.orc" -> orc,
- "org.apache.spark.sql.execution.datasources.orc.DefaultSource" -> nativeOrc,
- "org.apache.spark.sql.execution.datasources.orc" -> nativeOrc,
- "org.apache.spark.ml.source.libsvm.DefaultSource" -> libsvm,
- "org.apache.spark.ml.source.libsvm" -> libsvm,
- "com.databricks.spark.csv" -> csv,
- "org.apache.spark.sql.execution.streaming.TextSocketSourceProvider" -> socket,
- "org.apache.spark.sql.execution.streaming.RateSourceProvider" -> rate
- )
- }
- 。。。
- }
- /**
- * The provider class for all Kafka readers and writers. It is designed such that it throws
- * IllegalArgumentException when the Kafka Dataset is created, so that it can catch
- * missing options even before the query is started.
- *
- * Its `shortName()` is "kafka"; that alias is what `DataSource.lookupDataSource` compares
- * (case-insensitively) with the name given to `format(...)`.
- *
- * NOTE(review): the mixin list shown here contains `TableProvider` and neither
- * `MicroBatchReadSupport` nor `ContinuousReadSupport`, the traits that the quoted
- * `DataStreamReader.load()` matches on. This excerpt may come from a different Spark version
- * than the 2.4 code quoted elsewhere in the article; confirm against the 2.4 sources.
- */
- private[kafka010] class KafkaSourceProvider extends DataSourceRegister
- with StreamSourceProvider
- with StreamSinkProvider
- with RelationProvider
- with CreatableRelationProvider
- with TableProvider
- with Logging {
- import KafkaSourceProvider._
- override def shortName(): String = "kafka"
- 。。。。
- }
- /**
- * Data sources should implement this trait so that they can register an alias to their data source.
- * This allows users to give the data source alias as the format type over the fully qualified
- * class name.
- *
- * A new instance of this class will be instantiated each time a DDL call is made.
- *
- * @since 1.5.0
- */
- @InterfaceStability.Stable
- trait DataSourceRegister {
- /**
- * The string that represents the format that this data source provider uses. This is
- * overridden by children to provide a nice alias for the data source. For example:
- *
- * {{{
- * override def shortName(): String = "parquet"
- * }}}
- *
- * @return the alias users can pass as the format type instead of the fully qualified class name
- * @since 1.5.0
- */
- def shortName(): String
- }
