scala> val df=spark.read.json("/tmp/pdf1json")
df: org.apache.spark.sql.DataFrame = [age: bigint, fv: bigint ... 1 more field]

scala> df.show
+---+----+--------+
|age|  fv|    name|
+---+----+--------+
| 50|6998|   xiliu|
| 50| 866|zhangsan|
| 20| 565|zhangsan|
| 23| 565|  llihmj|
+---+----+--------+
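
For reference, spark.read.json expects JSON Lines input by default, i.e. one JSON object per line. The file contents are not shown in the transcript; a minimal sketch of what /tmp/pdf1json would have to contain to produce the DataFrame above:

{"name":"xiliu","age":50,"fv":6998}
{"name":"zhangsan","age":50,"fv":866}
{"name":"zhangsan","age":20,"fv":565}
{"name":"llihmj","age":23,"fv":565}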

scala> df.filter($"age">30).select("name","age").show
+--------+---+
|    name|age|
+--------+---+
|   xiliu| 50|
|zhangsan| 50|
+--------+---+

scala> df.filter($"age">30).select($"name",$"age"+1 as "age").show
+--------+---+
|    name|age|
+--------+---+
|   xiliu| 51|
|zhangsan| 51|
+--------+---+
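
The same filter-and-project can also be written with SQL expression strings via selectExpr instead of Column objects; a minimal equivalent sketch:

df.filter("age > 30").selectExpr("name", "age + 1 as age").show()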

scala> df.groupBy("age").count.show
+---+-----+
|age|count|
+---+-----+
| 50|    2|
| 23|    1|
| 20|    1|
+---+-----+
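
groupBy can also carry several aggregations at once through agg; a small sketch against the same DataFrame (the column aliases are illustrative):

import org.apache.spark.sql.functions.{count, avg, max}

df.groupBy("age")
  .agg(count("fv").as("cnt"), avg("fv").as("avg_fv"), max("fv").as("max_fv"))
  .show()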

scala> val spark=SparkSession.builder().enableHiveSupport().getOrCreate()
18/09/26 14:23:26 WARN sql.SparkSession$Builder: Using an existing SparkSession; some configuration may not take effect.
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@53b85e12
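
Inside spark-shell a session already exists, which is why the builder just returns it and prints that warning. In a standalone application the Hive-enabled session would be built explicitly; a sketch, with app name and master as placeholders:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("sparksql-demo")   // placeholder
  .master("local[*]")         // placeholder; omit when submitting to a cluster
  .enableHiveSupport()        // needs the Hive dependencies and hive-site.xml on the classpath
  .getOrCreate()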

scala> spark.sql("show databases").show
+------------+
|databaseName|
+------------+
|      dbtest|
|     default|
|      gamedw|
|  hive_hbase|
|     sqoopdb|
|      testdb|
|      userdb|
+------------+

scala> spark.sql("use gamedw").show
++
||
++
++

scala> spark.sql("show tables").show
+--------+----------------+-----------+
|database|       tableName|isTemporary|
+--------+----------------+-----------+
|  gamedw|         account|      false|
|  gamedw|account_ix_accid|      false|
|  gamedw|        cityinfo|      false|
|  gamedw|            cust|      false|
|  gamedw|          cust00|      false|
|  gamedw|           cust1|      false|
|  gamedw|          cust_1|      false|
|  gamedw|       cust_copy|      false|
|  gamedw|      cust_index|      false|
|  gamedw|       customers|      false|
|  gamedw|       employees|      false|
|  gamedw|         loginfo|      false|
|  gamedw|            mess|      false|
|  gamedw|       roleinfor|      false|
|  gamedw|          t_name|      false|
|  gamedw|         t_name1|      false|
|  gamedw|         t_name2|      false|
|  gamedw|table_index_test|      false|
|  gamedw|      table_test|      false|
|  gamedw|       tb_bucket|      false|
+--------+----------------+-----------+
only showing top 20 rows

scala> spark.sql("select * from gamedw.account limit 10").show
+-----------+--------+------+--------+--------------------+
|accountname|   accid|platid|  dateid|          createtime|
+-----------+--------+------+--------+--------------------+
|    1004210| 1004210|     6|20180116|2018-01-16 10:39:...|
|   20946754|20946754|     0|20170913|2017-09-13 10:02:...|
|   20946766|20946766|     0|20170901|2017-09-01 16:51:...|
|   20946793|20946793|     0|20171117|2017-11-17 16:51:...|
|   20946796|20946796|     0|20180110|2018-01-10 13:30:...|
|   20946962|20946962|     0|20171219|2017-12-19 15:43:...|
|   20957641|20957641|     0|20171117|2017-11-17 17:44:...|
|   20957642|20957642|     0|20171220|2017-12-20 15:32:...|
|   20963649|20963649|     6|20171220|2017-12-20 10:13:...|
|   20963674|20963674|    33|20171219|2017-12-19 22:59:...|
+-----------+--------+------+--------+--------------------+

scala> val df=spark.sql("select * from gamedw.account")
df: org.apache.spark.sql.DataFrame = [accountname: bigint, accid: bigint ... 3 more fields]

Save df to Hive as a bucketed, sorted table:

scala> df.write.bucketBy(10,"platid").sortBy("createtime").saveAsTable("acc_tb")
18/09/26 14:28:07 WARN hive.HiveExternalCatalog: Persisting bucketed data source table `gamedw`.`acc_tb` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.

scala> spark.sql("show tables like 'acc_tb'").show
+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|  gamedw|   acc_tb|      false|
+--------+---------+-----------+

Check the table in Hive:

hive> show create table acc_tb;
OK
CREATE TABLE `acc_tb`(
  `col` array<string> COMMENT 'from deserializer')
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
  'path'='hdfs://localhost:9000/user/hive/warehouse/gamedw.db/acc_tb')
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.SequenceFileInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'
LOCATION
  'hdfs://localhost:9000/user/hive/warehouse/gamedw.db/acc_tb'
TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='false',
  'numFiles'='12',
  'numRows'='-1',
  'rawDataSize'='-1',
  'spark.sql.sources.provider'='parquet',
  'spark.sql.sources.schema.bucketCol.0'='platid',
  'spark.sql.sources.schema.numBucketCols'='1',
  'spark.sql.sources.schema.numBuckets'='10',
  'spark.sql.sources.schema.numParts'='1',
  'spark.sql.sources.schema.numSortCols'='1',
  'spark.sql.sources.schema.part.0'='{\"type\":\"struct\",\"fields\":[{\"name\":\"accountname\",\"type\":\"long\",\"nullable\":true,\"metadata\":{\"HIVE_TYPE_STRING\":\"bigint\"}},{\"name\":\"accid\",\"type\":\"long\",\"nullable\":true,\"metadata\":{\"HIVE_TYPE_STRING\":\"bigint\"}},{\"name\":\"platid\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{\"HIVE_TYPE_STRING\":\"int\"}},{\"name\":\"dateid\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{\"HIVE_TYPE_STRING\":\"int\"}},{\"name\":\"createtime\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"HIVE_TYPE_STRING\":\"string\"}}]}',
  'spark.sql.sources.schema.sortCol.0'='createtime',
  'totalSize'='1212087',
  'transient_lastDdlTime'='1537943287')
Time taken: 0.369 seconds, Fetched: 27 row(s)

Check HDFS:

[root@host ~]# hdfs dfs -ls /user/hive/warehouse/gamedw.db/acc_tb
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/root/hadoop/hadoop-2.7.4/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/root/hive/apache-hive-2.1.1/lib/log4j-slf4j-impl-2.4.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Found 13 items
-rw-r--r--   1 root supergroup          0 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/_SUCCESS
-rw-r--r--   1 root supergroup      23357 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/part-00000-e275c19a-1728-49e2-bd6f-8598d76fe045_00000.c000.snappy.parquet
-rw-r--r--   1 root supergroup      89069 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/part-00000-e275c19a-1728-49e2-bd6f-8598d76fe045_00001.c000.snappy.parquet
-rw-r--r--   1 root supergroup     299566 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/part-00000-e275c19a-1728-49e2-bd6f-8598d76fe045_00002.c000.snappy.parquet
-rw-r--r--   1 root supergroup     272843 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/part-00000-e275c19a-1728-49e2-bd6f-8598d76fe045_00003.c000.snappy.parquet
-rw-r--r--   1 root supergroup      66923 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/part-00000-e275c19a-1728-49e2-bd6f-8598d76fe045_00004.c000.snappy.parquet
-rw-r--r--   1 root supergroup      67756 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/part-00000-e275c19a-1728-49e2-bd6f-8598d76fe045_00005.c000.snappy.parquet
-rw-r--r--   1 root supergroup      15025 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/part-00000-e275c19a-1728-49e2-bd6f-8598d76fe045_00006.c000.snappy.parquet
-rw-r--r--   1 root supergroup     138852 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/part-00000-e275c19a-1728-49e2-bd6f-8598d76fe045_00007.c000.snappy.parquet
-rw-r--r--   1 root supergroup      18598 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/part-00000-e275c19a-1728-49e2-bd6f-8598d76fe045_00008.c000.snappy.parquet
-rw-r--r--   1 root supergroup     217081 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/part-00000-e275c19a-1728-49e2-bd6f-8598d76fe045_00009.c000.snappy.parquet
-rw-r--r--   1 root supergroup       1378 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/part-00001-e275c19a-1728-49e2-bd6f-8598d76fe045_00007.c000.snappy.parquet
-rw-r--r--   1 root supergroup       1639 2018-09-26 14:28 /user/hive/warehouse/gamedw.db/acc_tb/part-00003-e275c19a-1728-49e2-bd6f-8598d76fe045_00006.c000.snappy.parquet

Query the data in Hive:

hive> select * from acc_tb;
OK
Failed with exception java.io.IOException:java.io.IOException: hdfs://localhost:9000/user/hive/warehouse/gamedw.db/acc_tb/part-00000-e275c19a-1728-49e2-bd6f-8598d76fe045_00000.c000.snappy.parquet not a SequenceFile
Time taken: 0.405 seconds

Hive cannot query the table: as the earlier warning said, the bucketed data-source table is persisted in a Spark SQL specific format, so the metastore definition is only a SequenceFile placeholder (a single `col` array<string> column) while the actual data files are Snappy-compressed Parquet.

Try it in spark-shell:
scala> spark.sql("select * from acc_tb").show
+-----------+--------+------+--------+--------------------+
|accountname|   accid|platid|  dateid|          createtime|
+-----------+--------+------+--------+--------------------+
|   22753184|22753184|    60|20180110|2018-01-10 15:17:...|
|   22755578|22755578|    60|20180111|2018-01-11 15:57:...|
|   22764087|22764087|    60|20180115|2018-01-15 14:32:...|
|   22766072|22766072|    60|20180116|2018-01-16 09:59:...|
|   22766191|22766191|    60|20180116|2018-01-16 10:03:...|
|   22766762|22766762|    60|20180116|2018-01-16 10:11:...|
|   22766933|22766933|    60|20180116|2018-01-16 10:15:...|
|   22767239|22767239|    60|20180116|2018-01-16 10:23:...|
|   22767249|22767249|    60|20180116|2018-01-16 10:23:...|
|   22767356|22767356|    60|20180116|2018-01-16 10:27:...|
|   22767396|22767396|    60|20180116|2018-01-16 10:28:...|
|   22767628|22767628|    60|20180116|2018-01-16 10:36:...|
|   22767650|22767650|    60|20180116|2018-01-16 10:38:...|
|   22767690|22767690|    60|20180116|2018-01-16 10:39:...|
|   22767710|22767710|    60|20180116|2018-01-16 10:40:...|
|   22767744|22767744|    60|20180116|2018-01-16 10:43:...|
|   22767862|22767862|    60|20180116|2018-01-16 10:45:...|
|   22767916|22767916|    60|20180116|2018-01-16 10:48:...|
|   22768016|22768016|    60|20180116|2018-01-16 10:57:...|
|   22768171|22768171|    60|20180116|2018-01-16 10:57:...|
+-----------+--------+------+--------+--------------------+
only showing top 20 rows

Success: Spark SQL reads the table without problems, because it takes the real schema and the Parquet format from the spark.sql.sources.* table properties rather than from the Hive placeholder definition.
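
If the table must also stay queryable from Hive, one option (a sketch, not run here; the target table name is made up) is to drop Spark's bucketBy and write a plain Hive-serde table instead, which Spark 2.2+ supports via format("hive"):

// assumption: no bucketing, so Hive can read the result directly
df.write.format("hive").mode("overwrite").saveAsTable("gamedw.acc_tb_hive")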

Save to a Hive table, partitioned and bucketed:

scala> df.write.partitionBy("platid").bucketBy(5,"dateid").sortBy("createtime").saveAsTable("acc_plat")
18/09/26 14:48:39 WARN hive.HiveExternalCatalog: Persisting bucketed data source table `gamedw`.`acc_plat` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.

Check the table in Hive:

hive> show create table acc_plat;
OK
CREATE TABLE `acc_plat`(
  `col` array<string> COMMENT 'from deserializer')
PARTITIONED BY (
  `platid` int)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
  'path'='hdfs://localhost:9000/user/hive/warehouse/gamedw.db/acc_plat')
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.SequenceFileInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'
LOCATION
  'hdfs://localhost:9000/user/hive/warehouse/gamedw.db/acc_plat'
TBLPROPERTIES (
  'spark.sql.partitionProvider'='catalog',
  'spark.sql.sources.provider'='parquet',
  'spark.sql.sources.schema.bucketCol.0'='dateid',
  'spark.sql.sources.schema.numBucketCols'='1',
  'spark.sql.sources.schema.numBuckets'='5',
  'spark.sql.sources.schema.numPartCols'='1',
  'spark.sql.sources.schema.numParts'='1',
  'spark.sql.sources.schema.numSortCols'='1',
  'spark.sql.sources.schema.part.0'='{\"type\":\"struct\",\"fields\":[{\"name\":\"accountname\",\"type\":\"long\",\"nullable\":true,\"metadata\":{\"HIVE_TYPE_STRING\":\"bigint\"}},{\"name\":\"accid\",\"type\":\"long\",\"nullable\":true,\"metadata\":{\"HIVE_TYPE_STRING\":\"bigint\"}},{\"name\":\"dateid\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{\"HIVE_TYPE_STRING\":\"int\"}},{\"name\":\"createtime\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"HIVE_TYPE_STRING\":\"string\"}},{\"name\":\"platid\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{\"HIVE_TYPE_STRING\":\"int\"}}]}',
  'spark.sql.sources.schema.partCol.0'='platid',
  'spark.sql.sources.schema.sortCol.0'='createtime',
  'transient_lastDdlTime'='1537944522')
Time taken: 0.22 seconds, Fetched: 27 row(s)

hive> select * from acc_plat;
OK
Failed with exception java.io.IOException:java.io.IOException: hdfs://localhost:9000/user/hive/warehouse/gamedw.db/acc_plat/platid=0/part-00000-ef876036-ec6e-4751-b28c-e8f77a3c5b31_00000.c000.snappy.parquet not a SequenceFile
Time taken: 4.071 seconds

Check HDFS:

[root@host ~]# hdfs dfs -ls -R /user/hive/warehouse/gamedw.db/acc_plat
-rw-r--r--   1 root supergroup          0 2018-09-26 14:48 /user/hive/warehouse/gamedw.db/acc_plat/_SUCCESS
drwxr-xr-x   - root supergroup          0 2018-09-26 14:48 /user/hive/warehouse/gamedw.db/acc_plat/platid=0
-rw-r--r--   1 root supergroup       9695 2018-09-26 14:48 /user/hive/warehouse/gamedw.db/acc_plat/platid=0/part-00000-ef876036-ec6e-4751-b28c-e8f77a3c5b31_00000.c000.snappy.parquet
-rw-r--r--   1 root supergroup       2701 2018-09-26 14:48 /user/hive/warehouse/gamedw.db/acc_plat/platid=0/part-00000-ef876036-ec6e-4751-b28c-e8f77a3c5b31_00001.c000.snappy.parquet
-rw-r--r--   1 root supergroup       8622 2018-09-26 14:48 /user/hive/warehouse/gamedw.db/acc_plat/platid=0/part-00000-ef876036-ec6e-4751-b28c-e8f77a3c5b31_00002.c000.snappy.parquet
-rw-r--r--   1 root supergroup       2025 2018-09-26 14:48 /user/hive/warehouse/gamedw.db/acc_plat/platid=0/part-00000-ef876036-ec6e-4751-b28c-e8f77a3c5b31_00003.c000.snappy.parquet
-rw-r--r--   1 root supergroup       8329 2018-09-26 14:48 /user/hive/warehouse/gamedw.db/acc_plat/platid=0/part-00000-ef876036-ec6e-4751-b28c-e8f77a3c5b31_00004.c000.snappy.parquet

.........................................

Query in spark-shell:

scala> spark.sql("select * from acc_plat limit 10").show
+-----------+--------+--------+--------------------+------+
|accountname|   accid|  dateid|          createtime|platid|
+-----------+--------+--------+--------------------+------+
|   22596994|22596994|20171122|2017-11-22 11:04:...|     0|
|   22600306|22600306|20171122|2017-11-22 15:42:...|     0|
|   21667705|21667705|20171201|2017-12-01 11:06:...|     0|
|   22631369|22631369|20171201|2017-12-01 17:15:...|     0|
|   22631399|22631399|20171201|2017-12-01 17:26:...|     0|
|   22631447|22631447|20171201|2017-12-01 17:46:...|     0|
|   21345007|21345007|20171206|2017-12-06 10:59:...|     0|
|   22659886|22659886|20171211|2017-12-11 14:21:...|     0|
|   22659908|22659908|20171211|2017-12-11 14:32:...|     0|
|   22659938|22659938|20171211|2017-12-11 14:46:...|     0|
+-----------+--------+--------+--------------------+------+

Write partitioned Parquet output to a specified directory:

scala> df.write.partitionBy("platid").format("parquet").save("/tmp/acc_platid")

Check HDFS:

[root@host ~]# hdfs dfs -ls /tmp/acc_platid
Found 40 items
-rw-r--r--   1 root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/_SUCCESS
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=0
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=1
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=10000
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=11
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=13
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=138
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=15
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=158
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=17
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=18
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=187
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=19
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=191
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=21
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=219
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=24294
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=247
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=27
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=277
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=286
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=287
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=288
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=289
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=291
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=295
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=3
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=33
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=35
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=36
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=38
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=4
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=46
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=50
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=6
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=60
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=83
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=89
drwxr-xr-x   - root supergroup          0 2018-09-26 14:58 /tmp/acc_platid/platid=9
drwxr-xr-x   - root supergroup          0 2018-09-26 14:59 /tmp/acc_platid/platid=93
[root@host ~]#
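
The partitioned output can be read straight back as Parquet; Spark discovers the platid column from the directory names and prunes partitions when it is filtered on. A small sketch:

import spark.implicits._   // already in scope inside spark-shell

val accByPlat = spark.read.parquet("/tmp/acc_platid")
accByPlat.where($"platid" === 60).show(5)   // only the platid=60 directory is scanned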

cube and rollup

scala> df.filter("dateid in(20180306,20180307)").cube("platid","dateid").count.orderBy("platid").show
+------+--------+-----+
|platid|  dateid|count|
+------+--------+-----+
|  null|    null|  271|
|  null|20180306|  135|
|  null|20180307|  136|
|     1|20180306|   19|
|     1|20180307|   31|
|     1|    null|   50|
|     3|20180306|    5|
|     3|    null|   11|
|     3|20180307|    6|
|     4|    null|   41|
|     4|20180306|   21|
|     4|20180307|   20|
|     6|20180306|    5|
|     6|    null|   10|
|     6|20180307|    5|
|    13|    null|    3|
|    13|20180307|    2|
|    13|20180306|    1|
|    15|20180307|   12|
|    15|    null|   24|
+------+--------+-----+
only showing top 20 rows

scala> df.filter("dateid in(20180306,20180307)").rollup("platid","dateid").count.orderBy("platid").show
+------+--------+-----+
|platid|  dateid|count|
+------+--------+-----+
|  null|    null|  271|
|     1|    null|   50|
|     1|20180306|   19|
|     1|20180307|   31|
|     3|20180307|    6|
|     3|    null|   11|
|     3|20180306|    5|
|     4|    null|   41|
|     4|20180307|   20|
|     4|20180306|   21|
|     6|20180306|    5|
|     6|    null|   10|
|     6|20180307|    5|
|    13|    null|    3|
|    13|20180307|    2|
|    13|20180306|    1|
|    15|20180307|   12|
|    15|20180306|   12|
|    15|    null|   24|
|    18|20180307|   14|
+------+--------+-----+
only showing top 20 rows
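
cube produces subtotals for every combination of the grouping columns (grand total, per platid, per dateid, and per platid+dateid pair), while rollup only produces the hierarchical ones (grand total, per platid, per platid+dateid); that is why dateid-only rows such as (null, 20180306) appear for cube but not for rollup. The same aggregation can be expressed in SQL; a sketch using a temp view name chosen here:

df.filter("dateid in (20180306,20180307)").createOrReplaceTempView("acc_0306_0307")
spark.sql("select platid, dateid, count(*) as cnt from acc_0306_0307 group by platid, dateid with cube order by platid").show()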

scala> val df=spark.read.json("/tmp/pdf1json")
df: org.apache.spark.sql.DataFrame = [age: bigint, fv: bigint ... 1 more field]

scala> df.show
+---+----+--------+
|age|  fv|    name|
+---+----+--------+
| 50|6998|   xiliu|
| 50| 866|zhangsan|
| 20| 565|zhangsan|
| 23| 565|  llihmj|
+---+----+--------+

scala> df.groupBy("age").pivot("fv").count.show
+---+----+----+----+
|age| 565| 866|6998|
+---+----+----+----+
| 50|null|   1|   1|
| 23|   1|null|null|
| 20|   1|null|null|
+---+----+----+----+
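
When the pivot values are not given, Spark first scans the data to collect the distinct fv values; passing them explicitly skips that extra pass and fixes the column order. A sketch with the values taken from the output above:

df.groupBy("age").pivot("fv", Seq(565, 866, 6998)).count.show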

scala> val df=spark.sql("select * from gamedw.account")
df: org.apache.spark.sql.DataFrame = [accountname: bigint, accid: bigint ... 3 more fields]

scala> df.filter("dateid in(20180501,20180502,20180503)").groupBy("dateid").pivot("platid").count.show
+--------+----+---+----+---+---+----+----+----+---+---+----+---+---+---+---+---+----+---+---+---+----+----+---+----+----+
|  dateid|   0|  1|   3|  4|  6|   9|  11|  13| 15| 18|  19| 21| 27| 33| 35| 36|  38| 46| 60| 83| 138| 158|191| 277| 289|
+--------+----+---+----+---+---+----+----+----+---+---+----+---+---+---+---+---+----+---+---+---+----+----+---+----+----+
|20180501|null| 12|null|  9|  4|null|null|null|  6| 33|null|  1|  9|  3|  6|  1|   1|  6|  5|  7|   1|   1|  8|   5|   1|
|20180502|null| 11|   1|  7|  4|null|   1|   2|  4| 25|   2|  1|  4|  1|  3|  3|null|  3|  5|  5|   1|null|  6|   1|null|
|20180503|   1|  6|   1|  2|  3|   1|null|   1|  3| 19|   2|  1|  6|  6|  1|  1|null|  4|  2|  8|null|null| 19|null|   1|
+--------+----+---+----+---+---+----+----+----+---+---+----+---+---+---+---+---+----+---+---+---+----+----+---+----+----+

scala> df.filter("dateid in(20180501,20180502,20180503)").groupBy("platid").pivot("dateid").count.show
+------+--------+--------+--------+
|platid|20180501|20180502|20180503|
+------+--------+--------+--------+
|    27|       9|       4|       6|
|     1|      12|      11|       6|
|    13|    null|       2|       1|
|     6|       4|       4|       3|
|     3|    null|       1|       1|
|   191|       8|       6|      19|
|    19|    null|       2|       2|
|    15|       6|       4|       3|
|     9|    null|    null|       1|
|    35|       6|       3|       1|
|     4|       9|       7|       2|
|   277|       5|       1|    null|
|    38|       1|    null|    null|
|   289|       1|    null|       1|
|    21|       1|       1|       1|
|    60|       5|       5|       2|
|    33|       3|       1|       6|
|    11|    null|       1|    null|
|    83|       7|       5|       8|
|   158|       1|    null|    null|
+------+--------+--------+--------+
only showing top 20 rows

User-defined functions (UDFs):

scala> val strlen=udf{(str:String)=>str.length}

strlen: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,IntegerType,Some(List(StringType)))

scala> spark.udf.register("slen",strlen)

res60: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,IntegerType,Some(List(StringType)))

scala> df.registerTempTable("df")
warning: there was one deprecation warning; re-run with -deprecation for details

scala> spark.sql("select name,slen(name) from df").show
+--------+---------+
|    name|UDF(name)|
+--------+---------+
|   xiliu|        5|
|zhangsan|        8|
|zhangsan|        8|
|  llihmj|        6|
+--------+---------+
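
The same UDF can also be applied directly through the DataFrame API, without registering a view, and registerTempTable (deprecated, hence the warning above) can be replaced by createOrReplaceTempView. A small sketch, assuming df here is the JSON DataFrame with the name column as in the output above:

import org.apache.spark.sql.functions.udf

val strlen = udf { (str: String) => str.length }
df.select($"name", strlen($"name").as("name_len")).show()

// non-deprecated replacement for registerTempTable
df.createOrReplaceTempView("df")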
