我的json文件如下:
{
"total": 3666,
"offset": 1,
"len": 2,
"workflows": [
{
"appName": "xxxx1",
"externalId": null,
"conf": null,
"run": 0,
"acl": null,
"appPath": null,
"parentId": null,
"lastModTime": "Tue, 08 Aug 2017 22:15:11 GMT",
"consoleUrl": "http://**************:11000/oozie?job=0000130-170807105041043-oozie-oozi-W",
"createdTime": "Tue, 08 Aug 2017 22:02:13 GMT",
"startTime": "Tue, 08 Aug 2017 22:02:13 GMT",
"toString": "Workflow id[0000130-170807105041043-oozie-oozi-W] status[SUCCEEDED]",
"id": "0000130-170807105041043-oozie-oozi-W",
"endTime": "Tue, 08 Aug 2017 22:15:11 GMT",
"user": "user1",
"actions": [],
"status": "SUCCEEDED",
"group": null
},
{
"appName": "xxxx2",
"externalId": null,
"conf": null,
"run": 0,
"acl": null,
"appPath": null,
"parentId": null,
"lastModTime": "Mon, 07 Aug 2017 20:16:20 GMT",
"consoleUrl": "http://**************:11000/oozie?job=0000031-170807105041043-oozie-oozi-W",
"createdTime": "Mon, 07 Aug 2017 20:15:02 GMT",
"startTime": "Mon, 07 Aug 2017 20:15:02 GMT",
"toString": "Workflow id[0000031-170807105041043-oozie-oozi-W] status[SUCCEEDED]",
"id": "0000031-170807105041043-oozie-oozi-W",
"endTime": "Mon, 07 Aug 2017 20:16:20 GMT",
"user": "user1",
"actions": [],
"status": "SUCCEEDED",
"group": null
}
]
}
我试着分析这个并把它放到一个Hive表中。我试过以下两种方法。
方法1:
select
get_json_object(json_data,'$.workflows[5].id') as id,
get_json_object(json_data,'$.workflows[5].appName') as app_name,
get_json_object(json_data,'$.workflows[5].createdTime') as created_time,
get_json_object(json_data,'$.workflows[5].startTime') as start_time,
get_json_object(json_data,'$.workflows[5].endTime') as end_time,
get_json_object(json_data,'$.workflows[5].user') as user,
get_json_object(json_data,'$.workflows[5].status') as status
from
leap_frog_audit.oozie_json_file
这给了我以下错误:
udtf只支持select子句中的单个表达式
方法2:
CREATE TABLE default.oozie_metrics AS
SELECT m.col AS id,
k.col AS appName,
c.col AS createdTime,
s.col AS start_time,
e.col AS end_time,
u.col AS user_name,
st.col AS status
FROM leap_frog_audit.oozie_json_file LATERAL VIEW explode(split(regexp_replace(get_json_object(json_data,'$.workflows[*].id'),'\\[\\"|\\"\\]',''),'\\"\\,\\"')) m LATERAL VIEW explode(split(regexp_replace(get_json_object(json_data,'$.workflows[*].appName'),'\\[\\"|\\"\\]',''),'\\"\\,\\"')) k LATERAL VIEW explode(split(regexp_replace(get_json_object(json_data,'$.workflows[*].createdTime'),'\\[\\"|\\"\\]',''),'\\"\\,\\"')) c LATERAL VIEW explode(split(regexp_replace(get_json_object(json_data,'$.workflows[*].startTime'),'\\[\\"|\\"\\]',''),'\\"\\,\\"')) s LATERAL VIEW explode(split(regexp_replace(get_json_object(json_data,'$.workflows[*].endTime'),'\\[\\"|\\"\\]',''),'\\"\\,\\"')) e LATERAL VIEW explode(split(regexp_replace(get_json_object(json_data,'$.workflows[*].user'),'\\[\\"|\\"\\]',''),'\\"\\,\\"')) u LATERAL VIEW explode(split(regexp_replace(get_json_object(json_data,'$.workflows[*].status'),'\\[\\"|\\"\\]',''),'\\"\\,\\"')) st
这需要很长时间才能完成。有什么有效的方法来获得下面的输出吗?
+--------------------------------------+----------+-------------------------------+-------------------------------+-------------------------------+-------+-----------+
| id | app_name | created_time | start_time | end_time | user | status |
+--------------------------------------+----------+-------------------------------+-------------------------------+-------------------------------+-------+-----------+
| 0000130-170807105041043-oozie-oozi-W | xxxx1 | Tue, 08 Aug 2017 22:02:13 GMT | Tue, 08 Aug 2017 22:02:13 GMT | Tue, 08 Aug 2017 22:15:11 GMT | user1 | SUCCEEDED |
+--------------------------------------+----------+-------------------------------+-------------------------------+-------------------------------+-------+-----------+
| 0000031-170807105041043-oozie-oozi-W | xxxx2 | Mon, 07 Aug 2017 20:15:02 GMT | Mon, 07 Aug 2017 20:15:02 GMT | Mon, 07 Aug 2017 20:16:20 GMT | user1 | SUCCEEDED |
+--------------------------------------+----------+-------------------------------+-------------------------------+-------------------------------+-------+-----------+
1条答案
按热度按时间kmpatx3s1#