state:New York city:New York population:8336697
state:California city:Los Angeles population:3857799
state:Illinois city:Chicago population:2714856
state:Texas city:Houston population:2160821
state:Pennsylvania city:Philadelphia population:1547607
state:Arizona city:Phoenix population:1488750
state:Texas city:San Antonio population:1382951
state:California city:San Diego population:1338348
state:Texas city:Dallas population:1241162
state:California city:San Jose population:982765
state:Texas city:Austin population:842592
创建数据库和外部表
CREATE DATABASE IF NOT EXISTS CSP COMMENT 'City, State, Population';
DROP TABLE IF EXISTS CSP.original;
CREATE EXTERNAL TABLE IF NOT EXISTS CSP.original
(
st STRING COMMENT 'State',
ct STRING COMMENT 'City',
po STRING COMMENT 'Population'
)
COMMENT 'Original Table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/user/cloudera/csp';
选择语句以regexp输出“state:”、“city:”和“population:”文本
SELECT
regexp_extract(st, '.*:(\\w.*)', 1) AS state,
regexp_extract(ct, '.*:(\\w.*)', 1) AS city,
regexp_extract(po, '.*:(\\w.*)', 1) AS population
FROM original;
查询结果
[localhost.localdomain:21000] > select regexp_extract(st, '.*:(\\w.*)', 1) AS state, regexp_extract(ct, '.*:(\\w.*)', 1) AS city, regexp_extract(po, '.*:(\\w.*)', 1) AS population FROM original limit 11;
Query: select regexp_extract(st, '.*:(\\w.*)', 1) AS state, regexp_extract(ct, '.*:(\\w.*)', 1) AS city, regexp_extract(po, '.*:(\\w.*)', 1) AS population FROM original limit 11
+--------------+--------------+------------+
| state | city | population |
+--------------+--------------+------------+
| New York | New York | 8336697 |
| California | Los Angeles | 3857799 |
| Illinois | Chicago | 2714856 |
| Texas | Houston | 2160821 |
| Pennsylvania | Philadelphia | 1547607 |
| Arizona | Phoenix | 1488750 |
| Texas | San Antonio | 1382951 |
| California | San Diego | 1338348 |
| Texas | Dallas | 1241162 |
| California | San Jose | 982765 |
| Texas | Austin | 842592 |
+--------------+--------------+------------+
Returned 11 row(s) in 0.22s
REGISTER piggybank.jar
A = LOAD 'csp.txt' USING PigStorage('\t') AS (st:chararray,ct:chararray,po:chararray);
data = FOREACH A GENERATE
REGEX_EXTRACT(st, '.*:(\\w.*)', 1) AS (state:chararray),
REGEX_EXTRACT(ct, '.*:(\\w.*)', 1) AS (city:chararray),
REGEX_EXTRACT(po, '.*:(\\w.*)', 1) AS (population:int);
STORE data INTO 'csp' USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE');
--csp.txt(输入)
state:New York city:New York population:8336697
state:California city:Los Angeles population:3857799
state:Illinois city:Chicago population:2714856
state:Texas city:Houston population:2160821
state:Pennsylvania city:Philadelphia population:1547607
state:Arizona city:Phoenix population:1488750
state:Texas city:San Antonio population:1382951
state:California city:San Diego population:1338348
state:Texas city:Dallas population:1241162
state:California city:San Jose population:982765
state:Texas city:Austin population:842592
1条答案
按热度按时间6psbrbz91#
Impala
impala中的解决方案使用与我之前发布的pig示例相同的regexp\u提取逻辑。
--csp.txt(输入文件,位于/user/cloudera/csp中)
创建数据库和外部表
选择语句以regexp输出“state:”、“city:”和“population:”文本
查询结果
Pig
对我来说,概念化这个过程最简单的方法实际上是先用pig,所以我用你的语法模拟了一个数据文件,然后用pig创建了这个程序。该程序的输出是一个csv格式的文件,如果您愿意,可以用来创建impala外部表。
--csp.清管器
--csp.txt(输入)
--csp(输出)