在sparkDataframe中加载嵌套的json文件

rekjcdws  于 2021-05-27  发布在  Spark
关注(0)|答案(1)|浏览(420)

嗨,我正在把 JSON 文件中的数据加载到 DataFrame 中。执行 df.show 时我可以看到列名,但这是一个嵌套的 JSON 文件,其中包含许多子列。请问有谁能指导我如何查看所有的列及其值?

     |-- A: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- b: string (nullable = true)
     |    |    |-- c: string (nullable = true)
     |    |    |-- d: array (nullable = true)
     |    |    |    |-- element: struct (containsNull = true)
     |    |    |    |    |-- e: double (nullable = true)
     |    |    |    |    |-- f: string (nullable = true)
     |    |    |    |    |-- g: string (nullable = true)
     |    |    |    |    |-- h: long (nullable = true)
     |    |    |    |    |-- i: string (nullable = true)
     |    |    |    |    |-- j: long (nullable = true)
     |    |    |    |    |-- k: string (nullable = true)
     |    |    |    |    |-- l: string (nullable = true)
     |    |    |    |    |-- m: array (nullable = true)
     |    |    |    |    |    |-- element: struct (containsNull = true)
     |    |    |    |    |    |    |-- n: struct (nullable = true)
     |    |    |    |    |    |    |    |-- e: double (nullable = true)
     |    |    |    |    |    |    |    |-- h: long (nullable = true)
     |    |    |    |    |    |    |    |-- j: long (nullable = true)
     |    |    |    |    |    |    |    |-- rt: long (nullable = true)
     |    |    |    |    |    |    |    |-- o: double (nullable = true)
     |    |    |    |    |    |    |    |-- p: long (nullable = true)
     |    |    |    |    |    |    |-- num: long (nullable = true)
     |    |    |    |    |-- ok: string (nullable = true)
     |    |    |    |    |-- ol: string (nullable = true)
     |    |    |    |    |-- o: double (nullable = true)
     |    |    |    |    |-- tabname: string (nullable = true)
     |    |    |    |    |-- p: long (nullable = true)
     |    |    |    |    |-- q: string (nullable = true)
     |    |    |    |    |-- val: double (nullable = true)
     |-- r: array (nullable = true)
     |    |-- element: string (containsNull = true)
     |-- s: string (nullable = true)
     |-- t: array (nullable = true)
     |    |-- element: string (containsNull = true)
     |-- u: array (nullable = true)
     |    |-- element: string (containsNull = true)
     |-- v: long (nullable = true)
     |-- w: string (nullable = true)
     |-- x: long (nullable = true)
     |-- y: string (nullable = true)
     |-- z: string (nullable = true)
     |-- ab: string (nullable = true)
     |-- ac: string (nullable = true)
     |-- ad: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- ek: array (nullable = true)
     |    |    |    |-- element: struct (containsNull = true)
     |    |    |    |    |-- bb: string (nullable = true)
     |    |    |    |    |-- doclst: array (nullable = true)
     |    |    |    |    |    |-- element: struct (containsNull = true)
     |    |    |    |    |    |    |-- ec: long (nullable = true)
     |    |    |    |    |    |    |-- ei: long (nullable = true)
     |    |    |    |    |    |    |-- oi: long (nullable = true)
     |    |    |    |    |-- rbb: string (nullable = true)
     |-- ada: struct (nullable = true)
     |    |-- ek: string (nullable = true)
     |    |-- ik: string (nullable = true)
     |-- address: struct (nullable = true)
     |    |-- ek: struct (nullable = true)
     |    |    |-- e: long (nullable = true)
     |    |    |-- h: long (nullable = true)
     |    |    |-- j: long (nullable = true)
     |    |    |-- o: long (nullable = true)
     |    |-- ccc: string (nullable = true)
     |    |-- ik: struct (nullable = true)
     |    |    |-- e: long (nullable = true)
     |    |    |-- h: long (nullable = true)
     |    |    |-- j: long (nullable = true)
     |    |    |-- o: long (nullable = true)
     |    |-- rk: string (nullable = true)
     |    |-- tk: struct (nullable = true)
     |    |    |-- e: long (nullable = true)
     |    |    |-- h: long (nullable = true)
     |    |    |-- j: long (nullable = true)
     |    |    |-- o: long (nullable = true)
     |-- lt: struct (nullable = true)
     |    |-- ff: struct (nullable = true)
     |    |    |-- zz: string (nullable = true)
     |    |    |-- yy: string (nullable = true)
     |    |    |-- xx: string (nullable = true)
     |-- za: long (nullable = true)
     |-- zb: string (nullable = true)
     |-- zc: long (nullable = true)

当我执行 df.show 时,我只能看到 address 这一列本身,却看不到它内部的子列(完整地址、地址1、地址2 等)。
我使用的代码如下。

val sampledata = sparksession.read.json(sc.wholeTextFiles(filepath).values)
    sampledata.select(($"Address").as("Address")).show
bd1hkmkf

bd1hkmkf1#

Address 列的类型是 struct,需要将该列展开(例如使用 select("Address.*"))才能查看其中的各个子列。
请参考以下代码:

scala> df.printSchema
root
 |-- Address: struct (nullable = true)
 |    |-- Address1: struct (nullable = true)
 |    |    |-- Hno: long (nullable = true)
 |    |    |-- pin: long (nullable = true)
 |    |    |-- state: long (nullable = true)
 |    |    |-- street: long (nullable = true)
 |    |-- Address2: struct (nullable = true)
 |    |    |-- Hno: long (nullable = true)
 |    |    |-- pin: long (nullable = true)
 |    |    |-- state: long (nullable = true)
 |    |    |-- street: long (nullable = true)
 |    |-- Address3: struct (nullable = true)
 |    |    |-- Hno: long (nullable = true)
 |    |    |-- pin: long (nullable = true)
 |    |    |-- state: long (nullable = true)
 |    |    |-- street: long (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- full_Address: string (nullable = true)
scala> df.select("Address.*").printSchema
root
 |-- Address1: struct (nullable = true)
 |    |-- Hno: long (nullable = true)
 |    |-- pin: long (nullable = true)
 |    |-- state: long (nullable = true)
 |    |-- street: long (nullable = true)
 |-- Address2: struct (nullable = true)
 |    |-- Hno: long (nullable = true)
 |    |-- pin: long (nullable = true)
 |    |-- state: long (nullable = true)
 |    |-- street: long (nullable = true)
 |-- Address3: struct (nullable = true)
 |    |-- Hno: long (nullable = true)
 |    |-- pin: long (nullable = true)
 |    |-- state: long (nullable = true)
 |    |-- street: long (nullable = true)
 |-- country: string (nullable = true)
 |-- full_Address: string (nullable = true)
scala> df.select("Address.*").show(false)
+-----------------+---------------------+-----------------+-------+------------+
|Address1         |Address2             |Address3         |country|full_Address|
+-----------------+---------------------+-----------------+-------+------------+
|[7, 462036, 1, 5]|[0, 462037, 0, 25578]|[0, 462038, 0, 0]|India  |112018      |
+-----------------+---------------------+-----------------+-------+------------+
scala> val expr = array((1 to 3).map(c => array(struct(lit(s"Address${c}").as("address_no"),col(s"Address.Address${c}").as("address_details"),col("Address.country"),col("Address.full_Address")))):_*)

scala>  df.withColumn("address",explode(expr)).select(explode($"address").as("address")).select($"address.address_no",$"address.address_details.*",$"address.country",$"address.full_Address").show(false)
+----------+---+------+-----+------+-------+------------+
|address_no|Hno|pin   |state|street|country|full_Address|
+----------+---+------+-----+------+-------+------------+
|Address1  |7  |462036|1    |5     |India  |112018      |
|Address2  |0  |462037|0    |25578 |India  |112018      |
|Address3  |0  |462038|0    |0     |India  |112018      |
+----------+---+------+-----+------+-------+------------+

相关问题