sparkSQL中的example学习(1)

SparkSQLDemo.scala



import org.apache.spark.sql.{Row, SparkSession}

import org.apache.spark.sql.types.{StringType, StructField, StructType}

/**
 * Walk-through of the introductory Spark SQL examples.
 *
 * Each `run*Demo` method is self-contained and receives the shared
 * [[SparkSession]]; `main` decides which of them is executed.
 */
object SparkSQLDemo {

  // $example on:create_ds$
  /** Record type used to build typed Datasets and to infer a schema by reflection. */
  case class Person(name: String, age: Long)
  // $example off:create_ds$

  // Location of the sample input files. The directory is machine-specific;
  // adjust it to wherever the Spark example resources live locally.
  private val ResourcesDir = "/Users/hadoop/app/spark/examples/src/main/resources"
  private val PeopleJsonPath = s"$ResourcesDir/people.json"
  private val PeopleTextPath = s"$ResourcesDir/people.txt"

  /**
   * Entry point: creates a local SparkSession, runs the selected demo and
   * stops the session afterwards.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Open the SparkSession.
    // $example on:init_session$
    val spark = SparkSession
      .builder()
      .appName("SparkSQLDemo")
      .master("local")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()
    // $example off:init_session$

    // The finally block guarantees the session is stopped even when a demo throws.
    try {
      // Uncomment the demos that should run.
      // runBasicDataFrameDemo(spark)
      // runDatasetCreationDemo(spark)
      // runInferSchemaDemo(spark)
      runProgrammaticSchemaDemo(spark)
    } finally {
      // Close the SparkSession.
      spark.stop()
    }
  }

  /**
   * Untyped DataFrame operations plus temporary / global temporary views,
   * all on the people.json sample file.
   *
   * @param spark the active session
   */
  private def runBasicDataFrameDemo(spark: SparkSession): Unit = {
    val df = spark.read.json(PeopleJsonPath)

    // Display the content of the DataFrame on stdout.
    df.show()

    // Print the schema in a tree format.
    df.printSchema()

    // Select only the "name" column.
    df.select("name").show()

    // This import is needed to use the $-notation.
    import spark.implicits._

    // Select everybody, but increment the age by 1.
    df.select($"name", $"age" + 1).show()

    // Select people older than 21. `filter` keeps the matching rows, whereas
    // `select($"age" > 21)` would only project a boolean column.
    df.filter($"age" > 21).show()

    // Count people by age.
    df.groupBy("age").count().show()

    // $example on:global_temp_view$
    // Register the DataFrame as a SQL temporary view.
    df.createOrReplaceTempView("people")

    val sqlDF = spark.sql("select * from people")
    sqlDF.show()

    // Register the DataFrame as a global temporary view.
    // NOTE(review): createGlobalTempView fails if the view already exists, so
    // this method cannot be run twice within one Spark application - confirm
    // whether createOrReplaceGlobalTempView is available in the Spark version used.
    df.createGlobalTempView("people")

    // A global temporary view is tied to the system-preserved database `global_temp`.
    spark.sql("select * from global_temp.people").show()

    // A global temporary view is visible across sessions.
    spark.newSession().sql("select * from global_temp.people").show()
    // $example off:global_temp_view$
  }

  /**
   * Creates Datasets from a case class, from primitives and from a JSON file.
   *
   * @param spark the active session
   */
  private def runDatasetCreationDemo(spark: SparkSession): Unit = {
    // `spark.implicits` is a container of implicit conversions for Datasets;
    // importing it provides `.toDS()` and the common encoders.
    import spark.implicits._

    // `.toDS()` is declared with parentheses to prevent the Scala compiler from
    // treating `rdd.toDS("1")` as calling toDS and then applying the result.

    // Encoders are created for case classes.
    val caseClassDS = Seq(Person("Andy", 32)).toDS()
    caseClassDS.show()

    // Encoders for most common types are provided by importing spark.implicits._
    val primitiveDS = Seq(1, 2, 3).toDS()
    primitiveDS.map(_ + 1).foreach(println(_)) // alternatively: .collect()

    // A DataFrame can be converted to a Dataset by providing a class;
    // the mapping is done by field name.
    val peopleDS = spark.read.json(PeopleJsonPath).as[Person]
    peopleDS.show()
  }

  /**
   * Builds a DataFrame from a text file by letting Spark infer the schema from
   * the [[Person]] case class, then queries it with SQL.
   *
   * @param spark the active session
   */
  private def runInferSchemaDemo(spark: SparkSession): Unit = {
    // $example on:schema_inferring$
    // For implicit conversions from RDDs to DataFrames.
    import spark.implicits._

    // Create an RDD of Person objects from a text file and convert it to a DataFrame.
    // Each line is expected to look like "name, age".
    val peopleDF = spark.sparkContext
      .textFile(PeopleTextPath)
      .map(_.split(","))
      .map(x => Person(x(0), x(1).trim.toInt))
      .toDF()

    // Register the DataFrame as a temporary view.
    peopleDF.createOrReplaceTempView("people")

    // SQL statements can be run through the sql method provided by Spark.
    val teenagersDF = spark.sql("select name, age from people where age between 13 and 19")

    // The columns of a row in the result can be accessed by field index ...
    teenagersDF.map(teenager => s"Name: ${teenager(0)}").show()

    // ... or by field name.
    teenagersDF.map(teenager => s"Name: ${teenager.getAs[String]("name")}").show()

    // There is no pre-defined encoder for Dataset[Map[K, V]], so define one
    // explicitly. The type is annotated so implicit resolution does not depend
    // on inference.
    implicit val mapEncoder: org.apache.spark.sql.Encoder[Map[String, Any]] =
      org.apache.spark.sql.Encoders.kryo[Map[String, Any]]

    // Primitive types and case classes can also be defined as implicit vals:
    // implicit val stringIntMapEncoder: Encoder[Map[String, Any]] = ExpressionEncoder()

    // row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T].
    teenagersDF
      .map(teenager => teenager.getValuesMap[Any](List("name", "age")))
      .foreach(println(_)) // alternatively: .collect()
    // $example off:schema_inferring$
  }

  /**
   * Builds a DataFrame from a text file with a schema that is assembled
   * programmatically (every column typed as a nullable string), then queries
   * it with SQL.
   *
   * @param spark the active session
   */
  private def runProgrammaticSchemaDemo(spark: SparkSession): Unit = {
    // Provides the String encoder needed by `results.map` below.
    import spark.implicits._

    // $example on:programmatic_schema$
    // Create an RDD.
    val peopleRDD = spark.sparkContext.textFile(PeopleTextPath)

    // The schema is encoded in a string.
    val schemaString = "name age"

    // Generate the schema from the schema string.
    val fields = schemaString
      .split(" ")
      .map(fieldName => StructField(fieldName, StringType, nullable = true))
    val schema = StructType(fields)

    // Convert the records of the RDD (people) to Rows.
    val rowRDD = peopleRDD
      .map(_.split(","))
      .map(attributes => Row(attributes(0), attributes(1).trim))

    // Apply the schema to the RDD.
    val peopleDF = spark.createDataFrame(rowRDD, schema)

    // Create a temporary view using the DataFrame.
    peopleDF.createOrReplaceTempView("people")

    // SQL can be run over a temporary view created from a DataFrame.
    val results = spark.sql("select name from people")

    // The results of SQL queries are DataFrames and support all the normal
    // operations; the columns of a row can be accessed by field index or by
    // field name.
    results.map(attributes => s"Name: ${attributes(0)}").show()
    // $example off:programmatic_schema$
  }

}

sparkSQL中的example学习(1)的更多相关文章

sparkSQL中的example学习(3)
UserDefinedTypedAggregation.scala(用户可自定义类型) import org.apache.spark.sql.expressions.Aggregator impor ...
sparkSQL中的example学习(2)
UserDefinedUntypedAggregate.scala(默认返回类型为空,不能更改) import org.apache.spark.sql.{Row, SparkSession} imp ...
PHP中的Libevent学习
wangbin@2012,1,3 目录 Libevent在php中的应用学习 1. Libevent介绍 2. 为什么要学习libevent 3. Php libeven ...
JS中childNodes深入学习
原文:JS中childNodes深入学习 <html xmlns="http://www.w3.org/1999/xhtml"> <head> <ti ...
CNCC2017中的深度学习与跨媒体智能
CNCC2017中的深度学习与跨媒体智能转载请注明作者:梦里茶目录机器学习与跨媒体智能传统方法与深度学习图像分割小数据集下的深度学习语音前沿技术生成模型基于贝叶斯的视觉信息编解码珠 ...
【Spark篇】---SparkSQL中自定义UDF和UDAF，开窗函数的应用
一.前述 SparkSQL中的UDF相当于是1进1出,UDAF相当于是多进一出,类似于聚合函数. 开窗函数一般分组取topn时常用. 二.UDF和UDAF函数 1.UDF函数 java代码: Spar ...
图解BERT（NLP中的迁移学习）
目录一.例子:句子分类二.模型架构模型的输入模型的输出三.与卷积网络并行四.嵌入表示的新时代回顾一下词嵌入 ELMo: 语境的重要性五.ULM-FiT:搞懂NLP中的迁移学习六.Tr ...
python中configparser模块学习
python中configparser模块学习 ConfigParser模块在python中用来读取配置文件,配置文件的格式跟windows下的ini配置文件相似,可以包含一个或多个节(section ...
Scala中的类学习
Scala中的类学习从java了解类的情况下,了解Scala的类并不难.Scala类中的字段自动带getter和setter方法,用@BeanProperty注解生成javaBean对象的getXX ...

随机推荐

shell脚本一次性将tab制表符改为4空格的方法
问题描述: 今天需要修改一些bash脚本,因为考虑到pycharm里面能够直接写,而我用pycharm比较多,所以直接用pycharm写了,由于改的那个bash脚本是别的同事写的,里面的缩进都是用的T ...
Windows远程桌面多用户登录的问题
RDP WRAPPER 同时登录多用户补丁 https://cloud.tencent.com/developer/article/1460728 解决系统更新导致无法多用户登录的问题问题 ...
13.Java基础_数组内存图
单个数组内存图 new int[3]: 在堆内存里申请一块空间存储int类型的变量(初始化时值都为0) int[] array: 在栈内存申请一块内存存储堆内存里数组的首地址 array[i]: 通过 ...
爬虫scrapy模块
首先下载scrapy模块这里有惊喜 https://www.cnblogs.com/bobo-zhang/p/10068997.html 创建一个scrapy文件首先在终端找到一个文件夹输入 s ...
牛客OI周赛13-提高组-0还是1-（dp+位运算）
https://ac.nowcoder.com/acm/contest/2970/A 给出长度为n的一连串位运算符号,用n+1个0或1使运算插入最后得到1,求01序列有多少种可能. dp[i][j]表 ...
使用threaddump-analyzer 快速查看jvm thread 状态信息
日常开发中,我们可以需要通过thread dump 查看线程信息,比如锁,spotify 团队提供了一个web 界面,很方便以下是简单使用,同时添加了docker 支持添加docker 支持 cl ...
SLAM中的非线性优化
总结一下SLAM中关于非线性优化的知识. 先列出参考: http://jacoxu.com/jacobian%E7%9F%A9%E9%98%B5%E5%92%8Chessian%E7%9F%A9%E9 ...
Python连载18-closure闭包解释及其注意点
一.闭包 1.定义:当一个函数在内部定义函数,并且内部的函数应用外部函数的参数或者局部变量,当内部函数被当做返回值的时候,相关参数和变量保存在返回的函数之中,这种结果,叫做闭包. 2.例子:连载17中 ...
windows xp 安装后不能ping，浏览器不能上网
windows xp MSDN版本下载地址: ed2k://|file|zh-hans_windows_xp_home_with_service_pack_3_x86_cd_x14-92408.is ...
LeetCode 155：最小栈 Min Stack
LeetCode 155:最小栈 Min Stack 设计一个支持 push,pop,top 操作,并能在常数时间内检索到最小元素的栈. push(x) -- 将元素 x 推入栈中. pop() -- ...

sparkSQL中的example学习(1)

sparkSQL中的example学习(1)的更多相关文章

随机推荐

热门专题