本文整理了Java中org.apache.spark.sql.DataFrame.javaRDD()
方法的一些代码示例,展示了DataFrame.javaRDD()
的具体用法。这些代码示例主要来源于Github
/Stackoverflow
/Maven
等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。DataFrame.javaRDD()
方法的具体详情如下:
包路径:org.apache.spark.sql.DataFrame
类名称:DataFrame
方法名:javaRDD
暂无
代码示例来源:origin: phuonglh/vn.vitk
/**
 * Converts a data frame of (sentence, tags) string rows into an RDD of
 * "token/tag token/tag ..." sentences.
 * Rows whose token and tag counts disagree are reported to stderr and mapped
 * to {@code null}.
 */
private JavaRDD<String> toTaggedSentence(DataFrame output) {
    return output.javaRDD().map(new Function<Row, String>() {
        private static final long serialVersionUID = 4208643510231783579L;

        @Override
        public String call(Row row) throws Exception {
            // Column 0 holds the tokens, column 1 the tags, both
            // whitespace-separated.
            String[] words = row.getString(0).trim().split("\\s+");
            String[] labels = row.getString(1).trim().split("\\s+");
            if (words.length != labels.length) {
                System.err.println("Incompatible lengths!");
                return null;
            }
            StringBuilder buffer = new StringBuilder(64);
            for (int i = 0; i < words.length; i++) {
                buffer.append(words[i]).append('/').append(labels[i]).append(' ');
            }
            return buffer.toString().trim();
        }
    });
}
代码示例来源:origin: stackoverflow.com
// Reads a Parquet data set, extracts the nested id_groups.ids column and
// prints each row as "a,b,c|d,e,f" (ids joined with ',', groups with '|').
DataFrame parquetData = sqlContext.read().parquet("/Users/leewallen/dev/spark_data/out/ParquetData");
parquetData.registerTempTable("pd");
DataFrame idGroupsDataFrame = sqlContext.sql("select id_groups.ids from pd");
List<String> idList = idGroupsDataFrame.javaRDD()
        .map((Function<Row, String>) row -> {
            // Each cell is a list of WrappedArray<String>.
            // FIX: the original folded the ids with
            // reduce(...).get() — an unchecked Optional.get() that throws
            // NoSuchElementException for an empty array/row, and rebuilds the
            // joined string via String.format on every step. String.join
            // handles the empty case (yields "") and joins in one pass.
            List<WrappedArray<String>> groups = row.getList(0);
            List<String> joinedGroups = new ArrayList<>();
            for (WrappedArray<String> group : groups) {
                List<String> ids = new ArrayList<>();
                Iterator<String> stringIter = group.iterator();
                while (stringIter.hasNext()) {
                    ids.add(stringIter.next());
                }
                joinedGroups.add(String.join(",", ids));
            }
            return String.join("|", joinedGroups);
        }).collect();
idList.forEach(id -> System.out.println(id));
代码示例来源:origin: psal/jstylo
/**
 * Converts a DataFrame to the {@code JavaRDD<LabeledPoint>} that most Spark
 * mllib classifiers consume, by mapping {@code LabeledFromRow} over its rows.
 * @param df the input data frame; each row is presumably convertible to a
 *           LabeledPoint by LabeledFromRow — confirm against that class
 * @return the rows of {@code df} mapped to LabeledPoint instances
 */
public static JavaRDD<LabeledPoint> DataFrameToLabeledPointRDD(DataFrame df){
return df.javaRDD().map(new LabeledFromRow());
}
代码示例来源:origin: phuonglh/vn.vitk
/**
 * Evaluates the accuracy of a CMM model on a data frame of tagged sentences.
 * @param dataset a data frame whose column 1 holds the gold tag sequences
 * @return evaluation measures produced by {@link Evaluator#evaluate}.
 */
public float[] evaluate(DataFrame dataset) {
    List<String> goldSequences = dataset.javaRDD().map(new RowToStringFunction(1)).collect();
    // Time only the model transformation itself.
    long start = System.currentTimeMillis();
    DataFrame output = cmmModel.transform(dataset);
    long stop = System.currentTimeMillis();
    if (verbose) {
        long elapsed = stop - start;
        System.out.println(" Number of sentences = " + goldSequences.size());
        System.out.println(" Total tagging time = " + elapsed + " ms.");
        System.out.println("Average tagging time = " + ((float)elapsed) / goldSequences.size() + " ms.");
    }
    List<String> predictedSequences = output.javaRDD().map(new RowToStringFunction(1)).collect();
    return Evaluator.evaluate(predictedSequences, goldSequences);
}
代码示例来源:origin: oeljeklaus-you/UserActionAnalyzePlatform
/**
 * Fetches user visit action rows within the given date range.
 * @param sc the SQLContext used to run the query
 * @param taskParam task parameters carrying the start and end dates
 * @return rows of user_visit_action with date between startTime and endTime, inclusive
 */
private static JavaRDD<Row> getActionRDD(SQLContext sc, JSONObject taskParam)
{
    String startTime = ParamUtils.getParam(taskParam, Constants.PARAM_STARTTIME);
    String endTime = ParamUtils.getParam(taskParam, Constants.PARAM_ENDTIME);
    // FIX: the original text was "select *from ..." — missing the space
    // between "*" and "from"; normalized so any SQL dialect parses it.
    // NOTE(review): the dates are concatenated straight into the SQL text.
    // If taskParam can carry untrusted input this is a SQL-injection risk;
    // validate the date strings upstream or use parameter binding.
    String sql = "select * from user_visit_action where date>='" + startTime
            + "' and date<='" + endTime + "'";
    DataFrame df = sc.sql(sql);
    return df.javaRDD();
}
代码示例来源:origin: Erik-ly/SprakProject
/**
 * Fetches user visit action rows within the given date range.
 * @param sqlContext SQLContext
 * @param taskParam task parameters (must contain startDate and endDate)
 * @return RDD of user_visit_action rows with startDate <= date <= endDate
 */
private static JavaRDD<Row> getActionRDDByDateRange(
        SQLContext sqlContext, JSONObject taskParam) {
    // Task-related constants declared in Constants.java:
    // String PARAM_START_DATE = "startDate";
    // String PARAM_END_DATE = "endDate";
    String startDate = ParamUtils.getParam(taskParam, Constants.PARAM_START_DATE);
    String endDate = ParamUtils.getParam(taskParam, Constants.PARAM_END_DATE);
    // BUG FIX: the original concatenation produced
    //   "...from user_visit_actionwhere date>='...'and date<='...'"
    // — the missing spaces before "where" and "and" made the SQL unparsable.
    String sql = "select * "
            + "from user_visit_action "
            + "where date>='" + startDate + "' "
            + "and date<='" + endDate + "'";
    DataFrame actionDF = sqlContext.sql(sql);
    return actionDF.javaRDD();
}
代码示例来源:origin: oeljeklaus-you/UserActionAnalyzePlatform
// Runs the previously built query and exposes the result as an RDD of rows.
// NOTE(review): `sc` and `sql` are defined outside this excerpt — presumably a
// SQLContext and a user-info query string; confirm against the enclosing method.
JavaRDD<Row> userInfoRDD=sc.sql(sql).javaRDD();
代码示例来源:origin: phuonglh/vn.vitk
/**
 * Tags a list of sentences and returns a list of tag sequences.
 * @param sentences raw input sentences, one per element
 * @return a list of tagged sequences, or {@code null} (with a message on
 *         stderr) when no tagging model has been created or loaded yet.
 */
public List<String> tag(List<String> sentences) {
    // Guard clause: without a model there is nothing to transform.
    if (cmmModel == null) {
        System.err.println("Tagging model is null. You need to create or load a model first.");
        return null;
    }
    // Single string column named "sentence".
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    List<Row> rows = new LinkedList<Row>();
    for (String s : sentences) {
        rows.add(RowFactory.create(s));
    }
    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame input = sqlContext.createDataFrame(rows, schema);
    DataFrame output = cmmModel.transform(input).repartition(1);
    return output.javaRDD().map(new RowToStringFunction(1)).collect();
}
代码示例来源:origin: Erik-ly/SprakProject
// Runs the previously built query and exposes the result as an RDD of rows.
// NOTE(review): `sqlContext` and `sql` come from outside this excerpt;
// confirm the query text against the enclosing method.
JavaRDD<Row> userInfoRDD = sqlContext.sql(sql).javaRDD();
代码示例来源:origin: phuonglh/vn.vitk
/**
 * Transforms the input data frame by decoding each row and returning a new
 * frame with two string columns: "sentence" and "prediction".
 */
@Override
public DataFrame transform(DataFrame dataset) {
    StructField[] fields = new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
        new StructField("prediction", DataTypes.StringType, false, Metadata.empty())
    };
    JavaRDD<Row> decoded = dataset.javaRDD().map(new DecodeFunction());
    return dataset.sqlContext().createDataFrame(decoded, new StructType(fields));
}
代码示例来源:origin: phuonglh/vn.vitk
// Derives the Markov order from the parameter map; the stored value is
// presumably 1-based (hence the -1 into values()) — confirm against params.
MarkovOrder order = MarkovOrder.values()[(Integer)params.getOrDefault(params.getMarkovOrder())-1];
ContextExtractor contextExtractor = new ContextExtractor(order, Constants.REGEXP_FILE);
// Extracts labeled training contexts from the dataset's rows.
JavaRDD<LabeledContext> contexts = contextExtractor.extract(dataset.javaRDD());
DataFrame dataFrame = dataset.sqlContext().createDataFrame(contexts, LabeledContext.class);
JavaRDD<Row> wt = df.select("word", "label").javaRDD();
JavaPairRDD<String, Set<Integer>> tagDictionary = wt.mapToPair(new PairFunction<Row, String, Set<Integer>>(){
private static final long serialVersionUID = 5865372074294028547L;
内容来源于网络,如有侵权,请联系作者删除!