Usage of org.apache.spark.sql.DataFrame.collect() with Code Examples


This article collects Java code examples of the org.apache.spark.sql.DataFrame.collect() method and shows how DataFrame.collect() is used in practice. The examples come from selected projects on platforms such as GitHub, Stack Overflow, and Maven, and should be useful as references. Details of the DataFrame.collect() method:
Package path: org.apache.spark.sql.DataFrame
Class name: DataFrame
Method name: collect

Introduction to DataFrame.collect

collect() materializes the entire result on the driver and returns its rows as a Row[] array. Use it with care on large datasets: every row must fit in driver memory, so it is usually combined with limit() or an aggregating query first. (This is the Spark 1.x Java API, where DataFrame is a concrete class; in Spark 2.x, DataFrame became an alias for Dataset&lt;Row&gt;, which also offers collectAsList().)
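
A minimal, self-contained sketch of the typical call pattern, assuming Spark 1.x running locally and an illustrative people.json input file (the class, path, and column names here are placeholders, not taken from the examples below):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class CollectExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("CollectExample").setMaster("local[*]");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // Load a DataFrame (the path is illustrative).
    DataFrame people = sqlContext.read().json("people.json");

    // collect() pulls every row of the result to the driver as a Row[] array.
    Row[] rows = people.select("name").collect();
    for (Row row : rows) {
      System.out.println(row.getString(0));
    }

    sc.stop();
  }
}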

Code examples

Code example source: databricks/learning-spark

Row[] result = topTweets.collect();
for (Row row : result) {
  System.out.println(row.get(0));
}

Row[] lengths = tweetLength.collect();
for (Row row : lengths) {
  System.out.println(row.get(0));
}
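
The snippet above omits where topTweets and tweetLength come from. A hedged reconstruction of the setup, assuming a registered tweets temp table with text and retweetCount columns (the context name hiveCtx and the queries are assumptions, not part of the original excerpt):

// Assumed setup, not part of the original snippet:
DataFrame topTweets = hiveCtx.sql(
    "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10");
DataFrame tweetLength = hiveCtx.sql("SELECT length(text) FROM tweets");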

Code example source: org.wso2.carbon.analytics/org.wso2.carbon.analytics.spark.core

public static List<Record> dataFrameToRecordsList(int tenantId, String tableName,
                                                  DataFrame dataFrame) {
    // Materialize all rows on the driver, then convert each Row into a Record.
    Row[] rows = dataFrame.collect();
    List<Record> records = new ArrayList<>();
    StructType schema = dataFrame.schema();
    for (Row row : rows) {
        records.add(new Record(tenantId, tableName, convertRowAndSchemaToValuesMap(row, schema)));
    }
    return records;
}
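
The helper convertRowAndSchemaToValuesMap is not shown in this excerpt. A plausible sketch of what such a helper could look like, pairing each schema field name with the Row value at the same index (the signature and return type are assumptions, not the actual WSO2 implementation):

import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Hypothetical reconstruction: maps each column name to the row's value at that index.
public static Map<String, Object> convertRowAndSchemaToValuesMap(Row row, StructType schema) {
    Map<String, Object> values = new LinkedHashMap<>();
    StructField[] fields = schema.fields();
    for (int i = 0; i < fields.length; i++) {
        values.put(fields[i].name(), row.get(i));
    }
    return values;
}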

Code example source: amidst/toolbox

private static ArrayList<String> getColumnStates(DataFrame data, String name) {
    ArrayList<String> states = new ArrayList<>();
    // Collect the distinct values of the given column to the driver.
    final Row[] statesRow = data.select(name).distinct().collect();
    for (Row r : statesRow) {
        states.add(r.getString(0));
    }
    return states;
}

Code example source: com.cloudera.livy/livy-test-lib

@Override
public List<String> call(JobContext jc) throws Exception {
 InputStream source = getClass().getResourceAsStream("/testweet.json");
 // Save the resource as a file in HDFS (or the local tmp dir when using a local filesystem).
 URI input;
 File local = File.createTempFile("tweets", ".json", jc.getLocalTmpDir());
 Files.copy(source, local.toPath(), StandardCopyOption.REPLACE_EXISTING);
 FileSystem fs = FileSystem.get(jc.sc().sc().hadoopConfiguration());
 if ("file".equals(fs.getUri().getScheme())) {
  input = local.toURI();
 } else {
  String uuid = UUID.randomUUID().toString();
  Path target = new Path("/tmp/" + uuid + "-tweets.json");
  fs.copyFromLocalFile(new Path(local.toURI()), target);
  input = target.toUri();
 }
 SQLContext sqlctx = useHiveContext ? jc.hivectx() : jc.sqlctx();
 sqlctx.jsonFile(input.toString()).registerTempTable("tweets");
 List<String> tweetList = new ArrayList<>();
 Row[] result =
  (Row[])(sqlctx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
   .collect());
 for (Row r : result) {
   tweetList.add(r.toString());
 }
 return tweetList;
}

Code example source: ddf-project/DDF

@Override
public SqlTypedResult sqlTyped(String command, Integer maxRows, DataSourceDescriptor dataSource) throws DDFException {
  DataFrame rdd = ((SparkDDFManager) this.getManager()).getHiveContext().sql(command);
  Schema schema = SparkUtils.schemaFromDataFrame(rdd);
  int columnSize = schema.getNumColumns();
  Row[] rddRows = rdd.collect();
  List<List<SqlTypedCell>> sqlTypedResult = new ArrayList<>();
  // Scan every cell and add the type information.
  for (int rowIdx = 0; rowIdx < rddRows.length; ++rowIdx) {
    List<SqlTypedCell> row = new ArrayList<>();
    for (int colIdx = 0; colIdx < columnSize; ++colIdx) {
      // TODO: Optimize by reducing getType().
      row.add(new SqlTypedCell(schema.getColumn(colIdx).getType(), rddRows[rowIdx].get(colIdx).toString()));
    }
    sqlTypedResult.add(row);
  }
  return new SqlTypedResult(schema, sqlTypedResult);
}
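
The in-code TODO asks for fewer getType() calls. One way to do that is to look up each column's type once before scanning the rows; a sketch against the same variables as above (the element type name Schema.ColumnType is an assumption about the DDF API):

// Hoist the per-column type lookups out of the row loop.
Schema.ColumnType[] types = new Schema.ColumnType[columnSize]; // type name assumed
for (int colIdx = 0; colIdx < columnSize; ++colIdx) {
  types[colIdx] = schema.getColumn(colIdx).getType();
}
for (int rowIdx = 0; rowIdx < rddRows.length; ++rowIdx) {
  List<SqlTypedCell> row = new ArrayList<>();
  for (int colIdx = 0; colIdx < columnSize; ++colIdx) {
    row.add(new SqlTypedCell(types[colIdx], rddRows[rowIdx].get(colIdx).toString()));
  }
  sqlTypedResult.add(row);
}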

Code example source: org.wso2.carbon.analytics/org.wso2.carbon.analytics.spark.core

private AnalyticsQueryResult toResult(DataFrame dataFrame)
        throws AnalyticsExecutionException {
    // Cap the number of rows brought to the driver when a limit is configured.
    int resultsLimit = this.sparkConf.getInt("carbon.spark.results.limit", -1);
    if (resultsLimit != -1) {
        return new AnalyticsQueryResult(dataFrame.schema().fieldNames(),
                convertRowsToObjects(dataFrame.limit(resultsLimit).collect()));
    } else {
        return new AnalyticsQueryResult(dataFrame.schema().fieldNames(),
                convertRowsToObjects(dataFrame.collect()));
    }
}
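
This example applies limit() before collect(), which is the standard way to bound driver memory use. The helper convertRowsToObjects is not shown; a plausible sketch that flattens each Row into a list of its column values (the signature is an assumption, not the WSO2 implementation):

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.sql.Row;

// Hypothetical reconstruction: turns each Row into a List<Object> of its cell values.
private static List<List<Object>> convertRowsToObjects(Row[] rows) {
    List<List<Object>> result = new ArrayList<>();
    for (Row row : rows) {
        List<Object> values = new ArrayList<>();
        for (int i = 0; i < row.length(); i++) {
            values.add(row.get(i));
        }
        result.add(values);
    }
    return result;
}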

Code example source: ddf-project/DDF

// Excerpt 1: collect the distinct non-null values of a column.
String sqlCmd = String.format("select distinct(%s) from %s where %s is not null",
    column.getName(), this.getDDF().getTableName(), column.getName());
DataFrame sqlresult = sqlContext.sql(sqlCmd);
Row[] rows = sqlresult.collect();
List<String> values = new ArrayList<>();
for (Row row : rows) {
  values.add(row.getString(0)); // loop body assumed; the original excerpt is truncated here
}

// Excerpt 2 (separate method): run an aggregate query and read the single result row.
sql = String.format("select %s from %s", sql, this.getDDF().getTableName());
DataFrame sqlResult = sqlContext.sql(sql);
Row[] resultRows = sqlResult.collect();
Row result = resultRows[0];
int i = 0;

Code example source: phuonglh/vn.vitk

DataFrame df0 = (new SQLContext(jsc)).createDataFrame(jrdd, WhitespaceContext.class);
DataFrame df1 = model.transform(df0);
// Broadcast the collected "prediction" column so executors can read it.
prediction = jsc.broadcast(df1.select("prediction").collect());
if (df1.count() > 0) {
  output = s.map(new WhitespaceClassificationFunction());
}
