SparkSQL Reading and Writing External Data Sources: Reading and Writing JSON Files
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Dataset, Encoders, SaveMode, SparkSession}

import scala.util.Try

object JsonFileTest {

  // The original series imports BASE_PATH from a shared Utils object; a local placeholder is used here.
  val BASE_PATH = "file:///tmp/spark-json-demo"

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local")
      .appName("JsonFileTest")
      .getOrCreate()

    import spark.implicits._

    // Convert the parquet data into JSON files
    val sessionDf = spark.read.parquet(s"${BASE_PATH}/trackerSession")
    sessionDf.show()
    sessionDf.write.json(s"${BASE_PATH}/json")

    // Read the JSON files back
    val jsonDF = spark.read.json(s"${BASE_PATH}/json")
    jsonDF.show()

    // A DataFrame can also be created from a JSON Dataset[String]
    val jsonDataset = spark.createDataset(
      """{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}""" :: Nil)
    val otherJsonDF = spark.read.json(jsonDataset)
    otherJsonDF.show()

    // primitivesAsString (default false): parse primitive values as strings; the primitives here are
    // boolean, int, long, float and double
    // prefersDecimal (default false): when primitivesAsString is false, parse float/double as DecimalType
    val jsonDataset_1 = spark.createDataset(
      """{"name":"Yin","address":{"is_old":true,"area":23000.34}}""" :: Nil)
    var otherJsonDF_1 = spark.read.json(jsonDataset_1)
    otherJsonDF_1.printSchema()
    /*
    root
     |-- address: struct (nullable = true)
     |    |-- area: double (nullable = true)
     |    |-- is_old: boolean (nullable = true)
     |-- name: string (nullable = true)
     */

    var optsMap = Map("primitivesAsString" -> "true", "prefersDecimal" -> "true")
    otherJsonDF_1 = spark.read.options(optsMap).json(jsonDataset_1)
    otherJsonDF_1.printSchema()
    /*
    root
     |-- address: struct (nullable = true)
     |    |-- area: string (nullable = true)
     |    |-- is_old: string (nullable = true)
     |-- name: string (nullable = true)
     */

    optsMap = Map("primitivesAsString" -> "false", "prefersDecimal" -> "true")
    otherJsonDF_1 = spark.read.options(optsMap).json(jsonDataset_1)
    otherJsonDF_1.printSchema()
    /*
    root
     |-- address: struct (nullable = true)
     |    |-- area: decimal(7,2) (nullable = true)
     |    |-- is_old: boolean (nullable = true)
     |-- name: string (nullable = true)
     */

    // allowComments (default false): allow Java/C style comments inside the JSON
    spark.read.option("allowComments", "true").json(Seq("""{"name":/* hello */"Yin","address":{"is_old":true,"area":23000.34}}""").toDS()).show()

    // allowUnquotedFieldNames (default false): allow field names without quotes
    spark.read.option("allowUnquotedFieldNames", "true").json(Seq("""{name:"Yin","address":{"is_old":true,"area":23000.34}}""").toDS()).show()

    // allowSingleQuotes (default true): allow single-quoted field names and values
    spark.read.option("allowSingleQuotes", "true").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":23000.34}}""").toDS()).show()

    // allowNumericLeadingZeros (default false): allow numbers with leading zeros
    spark.read.option("allowNumericLeadingZeros", "true").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":0023000.34}}""").toDS()).show()

    // allowNonNumericNumbers (default false): allow NaN (not a number) values
    spark.read.option("allowNonNumericNumbers", "true").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":NaN}}""").toDS()).show()

    // allowBackslashEscapingAnyCharacter (default false): allow a backslash to escape any character
    // (the backslash itself is dropped)
    spark.read.option("allowBackslashEscapingAnyCharacter", "true").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":"\$23000"}}""").toDS()).show()

    // mode (default PERMISSIVE): how records that fail to parse are handled
    // PERMISSIVE: keep the malformed record; its raw JSON string goes into an extra column whose name is
    // the value of columnNameOfCorruptRecord, and every other column is set to null
    spark.read.option("mode", "PERMISSIVE").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":3000}}""",
      """{'name':'Yin',"address":{"is_old":true,"area":\3000}}""").toDS()).show()
    /*
    +--------------------+-----------+----+
    |     _corrupt_record|    address|name|
    +--------------------+-----------+----+
    |                null|[3000,true]| Yin|
    |{'name':'Yin',"ad...|       null|null|
    +--------------------+-----------+----+
     */

    spark.read.option("mode", "PERMISSIVE").option("columnNameOfCorruptRecord", "customer_column").json(
      Seq("""{'name':'Yin',"address":{"is_old":true,"area":3000}}""",
        """{'name':'Yin',"address":{"is_old":true,"area":\3000}}""").toDS()).show()
    /*
    +-----------+--------------------+----+
    |    address|     customer_column|name|
    +-----------+--------------------+----+
    |[3000,true]|                null| Yin|
    |       null|{'name':'Yin',"ad...|null|
    +-----------+--------------------+----+
     */

    // DROPMALFORMED: silently drop malformed records
    spark.read.option("mode", "DROPMALFORMED").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":3000}}""",
      """{'name':'Yin',"address":{"is_old":true,"area":\3000}}""").toDS()).show()
    /*
    +-----------+----+
    |    address|name|
    +-----------+----+
    |[3000,true]| Yin|
    +-----------+----+
     */

    // FAILFAST: fail with an exception as soon as a malformed record is encountered
    // (wrapped in Try here so the rest of the demo keeps running)
    Try {
      spark.read.option("mode", "FAILFAST").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":3000}}""",
        """{'name':'Yin',"address":{"is_old":true,"area":\3000}}""").toDS()).show()
    }.failed.foreach(e => println(s"FAILFAST raised: ${e.getMessage}"))

    // dateFormat (default yyyy-MM-dd): the string format of dates in the JSON (applies to DateType columns)
    val customSchema = new StructType(Array(StructField("name", StringType, true),
      StructField("date", DateType, true)))
    val dataFormatDF =
      spark.read.schema(customSchema).option("dateFormat", "dd/MM/yyyy HH:mm").json(Seq(
        """{'name':'Yin',"date":"26/08/2015 18:00"}""").toDS())
    dataFormatDF.write.mode(SaveMode.Overwrite).option("dateFormat", "yyyy/MM/dd").json("testjson")
    spark.read.json("testjson").show()
    // timestampFormat (default yyyy-MM-dd'T'HH:mm:ss.SSSZZ): the string format of timestamps in the JSON
    // (applies to TimestampType columns)
    val customSchema_1 = new StructType(Array(StructField("name", StringType, true),
      StructField("date", TimestampType, true)))
    val timestampFormatDf =
      spark.read.schema(customSchema_1).option("timestampFormat", "dd/MM/yyyy HH:mm").json(Seq(
        """{'name':'Yin',"date":"26/08/2015 18:00"}""").toDS())

    // DateTimeUtils.TIMEZONE_OPTION is Spark's internal constant for the "timeZone" option key
    val optMap = Map("timestampFormat" -> "yyyy/MM/dd HH:mm", DateTimeUtils.TIMEZONE_OPTION -> "GMT")
    timestampFormatDf.write.mode(SaveMode.Overwrite).format("json").options(optMap).save("test.json")
    spark.read.json("test.json").show()
    // compression: the codec used when writing; supported values are:
    // none and uncompressed (no compression), plus bzip2, deflate, gzip, lz4 and snappy
    timestampFormatDf.write.mode(SaveMode.Overwrite).option("compression", "gzip").json("test.json")

    // multiLine: whether a single JSON record may span multiple lines
    val primitiveFieldAndType: Dataset[String] = spark.createDataset(spark.sparkContext.parallelize(
      """{"string":"this is a simple string.",
          "integer":10,
          "long":21474836470,
          "bigInteger":92233720368547758070,
          "double":1.7976931348623157E308,
          "boolean":true,
          "null":null
      }""" ::
      """{"string":"this is a simple string.",
        | "integer":10,
        | "long":21474836470,
        | "bigInteger":92233720368547758070,
        | "double":1.7976931348623157E308,
        | "boolean":true,
        | "null":null
        | }""" :: Nil))(Encoders.STRING)
    primitiveFieldAndType.toDF("value").write.mode(SaveMode.Overwrite).option("compression", "GzIp").text(s"${BASE_PATH}/primitiveFieldAndType")

    val multiLineDF = spark.read.option("multiLine", false).json(s"${BASE_PATH}/primitiveFieldAndType")
    multiLineDF.show()

    spark.stop()
  }
}
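The listing ends by reading the multi-line records back with multiLine left at false, so each physical line is treated as its own JSON record and the multi-line data cannot be parsed as a whole. For comparison, here is a minimal stand-alone sketch (not part of the original post; the object name and placeholder path are assumptions) that re-reads the same output directory with multiLine enabled:

import org.apache.spark.sql.SparkSession

object MultiLineJsonReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("MultiLineJsonReadSketch").getOrCreate()

    // Placeholder; point this at the primitiveFieldAndType directory written by JsonFileTest above
    val BASE_PATH = "file:///tmp/spark-json-demo"

    // multiLine=true makes Spark parse each input file as a whole, so a record may span several lines;
    // with the default (false) Spark expects one JSON record per physical line (JSON Lines format)
    val multiLineDF = spark.read
      .option("multiLine", true)
      .json(s"${BASE_PATH}/primitiveFieldAndType")

    multiLineDF.printSchema()
    multiLineDF.show(false)

    spark.stop()
  }
}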