This article collects code examples of the org.apache.spark.rdd.RDD class in Java and shows how the RDD class is used in practice. The examples are drawn from selected open-source projects on platforms such as GitHub, Stack Overflow, and Maven, so they should serve as useful references. Details of the RDD class:
Package: org.apache.spark.rdd.RDD
Class name: RDD
Description: not available
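Before the project snippets, here is a minimal self-contained sketch (an illustrative local setup assumed here, not taken from any of the projects below) showing the basic round trip between the Java-facing JavaRDD and the underlying org.apache.spark.rdd.RDD, which is the pattern most of the examples rely on:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;

public class RddInteropExample {
    public static void main(String[] args) {
        // Local context for illustration only
        SparkConf conf = new SparkConf().setAppName("rdd-interop").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> javaRdd = sc.parallelize(Arrays.asList("a", "b", "c"));

        // JavaRDD.rdd() exposes the underlying Scala RDD...
        RDD<String> scalaRdd = javaRdd.rdd();

        // ...and RDD.toJavaRDD() wraps it back into the Java-friendly API
        JavaRDD<String> backToJava = scalaRdd.toJavaRDD();
        System.out.println(backToJava.count()); // 3

        sc.stop();
    }
}

Note that JavaRDD.rdd() and RDD.toJavaRDD() do not copy data; they only wrap the same underlying RDD for the other API.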
Code example source: apache/tinkerpop
@Override
public boolean cp(final String sourceLocation, final String targetLocation) {
    final List<String> rdds = Spark.getRDDs().stream().filter(r -> r.name().startsWith(sourceLocation)).map(RDD::name).collect(Collectors.toList());
    if (rdds.size() == 0)
        return false;
    for (final String rdd : rdds) {
        // Copy each matching RDD under the target name: the no-op filter(a -> true) creates a new RDD,
        // and cache().count() forces it to be materialized under that name.
        Spark.getRDD(rdd).toJavaRDD().filter(a -> true).setName(rdd.equals(sourceLocation) ? targetLocation : rdd.replace(sourceLocation, targetLocation)).cache().count();
        // TODO: this should use the original storage level
    }
    return true;
}
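One hedged way to address the TODO above (an illustrative sketch, not TinkerPop's actual fix) would be to read the source RDD's current storage level via RDD.getStorageLevel() and persist the copy with that level instead of the default cache():

// Sketch only: copy the RDD under the new name while preserving its original storage level.
// Assumes the same variables (rdd, sourceLocation, targetLocation) as cp() above;
// StorageLevel is org.apache.spark.storage.StorageLevel.
RDD<?> source = Spark.getRDD(rdd);
StorageLevel originalLevel = source.getStorageLevel();
source.toJavaRDD()
        .filter(a -> true)
        .setName(rdd.equals(sourceLocation) ? targetLocation : rdd.replace(sourceLocation, targetLocation))
        .persist(originalLevel)
        .count();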
Code example source: org.apache.spark/spark-core
@Test
public void collectUnderlyingScalaRDD() {
    List<SomeCustomClass> data = new ArrayList<>();
    for (int i = 0; i < 100; i++) {
        data.add(new SomeCustomClass());
    }
    JavaRDD<SomeCustomClass> rdd = sc.parallelize(data);
    SomeCustomClass[] collected =
        (SomeCustomClass[]) rdd.rdd().retag(SomeCustomClass.class).collect();
    assertEquals(data.size(), collected.length);
}
Code example source: OryxProject/oryx
// Parse the raw training data and cache it for reuse
JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
parsedRDD.cache();
// ALS runs on the underlying Scala RDD; cache it for training and unpersist it afterwards
RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
trainingRatingDataRDD.cache();
MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
trainingRatingDataRDD.unpersist(false);
Code example source: Stratio/deep-spark
assertEquals(1, inputRDDEntity.count());
List<BookEntity> books = inputRDDEntity.toJavaRDD().collect();
JavaRDD<String> words = inputRDDEntity2.toJavaRDD().flatMap(new FlatMapFunction<BookEntity, String>() {
    @Override
    public Iterable<String> call(BookEntity bookEntity) throws Exception {
JavaPairRDD<String, Long> wordCount = words.mapToPair(new PairFunction<String, String, Long>() {
    @Override
    public Tuple2<String, Long> call(String s) throws Exception {
Assert.assertEquals(WORD_COUNT_SPECTED.longValue(), ((Long) outputRDDEntity.cache().count()).longValue());
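The Stratio snippet above is cut off where the anonymous function bodies begin. As a rough, self-contained sketch of the same word-count pattern (hypothetical class and variable names, written against the older Spark 1.x Java API that the snippet appears to use, in which FlatMapFunction.call returns an Iterable):

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class WordCountSketch {

    // 'lines' stands in for something like the book contents read in the test above
    public static JavaPairRDD<String, Long> wordCount(JavaRDD<String> lines) {
        // Split each line into words
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split("\\s+"));
            }
        });

        // Pair each word with 1 and sum the counts per word
        return words.mapToPair(new PairFunction<String, String, Long>() {
            @Override
            public Tuple2<String, Long> call(String word) throws Exception {
                return new Tuple2<>(word, 1L);
            }
        }).reduceByKey(new Function2<Long, Long, Long>() {
            @Override
            public Long call(Long a, Long b) throws Exception {
                return a + b;
            }
        });
    }
}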
Code example source: com.cerner.bunsen/bunsen-core
/**
 * Returns a dataset of ValueSet from the content stored at the given directory.
 */
protected Dataset<T> valueSetDatasetFromDirectory(String path) {
    JavaRDD<Tuple2<String,String>> fileNamesAndContents = this.spark.sparkContext()
        .wholeTextFiles(path, 1)
        .toJavaRDD();
    return this.spark.createDataset(fileNamesAndContents
        .map(new ToValueSet(fhirVersion))
        .rdd(), valueSetEncoder);
}
Code example source: org.apache.pig/pig
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
        POStream poStream) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poStream, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    StreamFunction streamFunction = new StreamFunction(poStream);
    return rdd.toJavaRDD().mapPartitions(streamFunction, true).rdd();
}
Code example source: edu.usc.ir/age-predictor-cli
null : new AgeClassifyModelWrapper(classifyModel);
JavaRDD<String> data = spark.sparkContext().textFile(dataIn, 8).toJavaRDD().cache();
JavaRDD<Row> samples = data.map(
    new Function<String, Row>() {
        public Row call(String s) throws IOException {
JavaRDD<Row> validSamples = samples.filter(
    new Function<Row, Boolean>() {
        @Override
Code example source: edu.usc.ir/age-predictor-cli
int iterations = getIterations(params);
JavaRDD<String> data = spark.sparkContext().textFile(eventDir, 24).toJavaRDD()
        .cache();
JavaRDD<Row> samples = data.map(
    new Function<String, Row>() {
        public Row call(String s) {
    }).cache();
JavaRDD<Row> validSamples = samples.filter(
Code example source: com.stratio.deep/deep-core
private RDD<Cells> createRDDFromFilePath(String filePath, TextFileDataTable textFileDataTable) {
    RDD<String> result = this.sc().textFile(filePath.toString(), 1);
    JavaRDD<Cells> resultCells = result.toJavaRDD().map(new MapSchemaFromLines(textFileDataTable));
    return resultCells.rdd();
}
Code example source: edu.usc.ir/age-predictor-cli
tokenizer, featureGenerators);
JavaRDD<String> data = spark.sparkContext().textFile(dataIn, 48).toJavaRDD()
        .cache();
JavaRDD<EventWrapper> samples = data.map(new CreateEvents(wrapper)).cache();
JavaRDD<EventWrapper> validSamples = samples.filter(
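The three age-predictor-cli fragments above all follow the same shape: read a text file with sparkContext().textFile(path, partitions), convert it to a JavaRDD, cache it, map each line to a Row (or an event wrapper), and filter out entries that failed to parse. A rough self-contained sketch of that pattern (the class name and parsing logic here are hypothetical, not the project's actual code):

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;

public class TextToRowsSketch {

    public static JavaRDD<Row> loadSamples(SparkSession spark, String dataIn, int partitions) {
        // Read the raw lines and cache them, as in the fragments above
        JavaRDD<String> data = spark.sparkContext()
                .textFile(dataIn, partitions)
                .toJavaRDD()
                .cache();

        // Hypothetical parsing: split "label<TAB>text" lines into a two-column Row,
        // returning null for lines that do not match
        JavaRDD<Row> samples = data.map(new Function<String, Row>() {
            @Override
            public Row call(String s) {
                String[] parts = s.split("\t", 2);
                return parts.length == 2 ? RowFactory.create(parts[0], parts[1]) : null;
            }
        }).cache();

        // Keep only the rows that parsed successfully
        return samples.filter(new Function<Row, Boolean>() {
            @Override
            public Boolean call(Row row) {
                return row != null;
            }
        });
    }
}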
Code example source: org.apache.crunch/crunch-spark
Set<Target> targets = outputTargets.get(e.getKey());
if (targets.size() > 1) {
    rdd.rdd().cache();
}
if (rdd instanceof JavaRDD) {
    outRDD = ((JavaRDD) rdd)
        .map(new MapFunction(c.applyPTypeTransforms() ? ptype.getOutputMapFn() : ident, ctxt))
        .mapToPair(new OutputConverterFunction(c));
} else {
    outRDD = ((JavaPairRDD) rdd)
        .map(new PairMapFunction(c.applyPTypeTransforms() ? ptype.getOutputMapFn() : ident, ctxt))
        .mapToPair(new OutputConverterFunction(c));
Code example source: crcsmnky/mongodb-spark-demo
Logger log = sc.sc().log();
log.warn("ratings = " + ratingsData.count());
log.warn("users = " + userData.count());
log.warn("movies = " + movieData.count());
JavaRDD<Rating> predictions = model.predict(usersMovies.rdd()).toJavaRDD();
Object.class, Object.class, MongoOutputFormat.class, predictionsConfig);
sc.sc().log().info("predictionsOutput.splits() = " + predictionsOutput.splits().size());
Code example source: org.apache.pig/pig
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POJoinGroupSpark op) throws IOException {
    SparkUtil.assertPredecessorSizeGreaterThan(predecessors, op, 0);
    List<POLocalRearrange> lraOps = op.getLROps();
    POGlobalRearrangeSpark glaOp = op.getGROp();
    POPackage pkgOp = op.getPkgOp();
    int parallelism = SparkPigContext.get().getParallelism(predecessors, glaOp);
    List<RDD<Tuple2<IndexedKey, Tuple>>> rddAfterLRA = new ArrayList<RDD<Tuple2<IndexedKey, Tuple>>>();
    boolean useSecondaryKey = glaOp.isUseSecondaryKey();
    for (int i = 0; i < predecessors.size(); i++) {
        RDD<Tuple> rdd = predecessors.get(i);
        rddAfterLRA.add(rdd.map(new LocalRearrangeFunction(lraOps.get(i), glaOp),
                SparkUtil.<IndexedKey, Tuple>getTuple2Manifest()));
    }
    if (rddAfterLRA.size() == 1 && useSecondaryKey) {
        return SecondaryKeySortUtil.handleSecondarySort(rddAfterLRA.get(0), pkgOp);
    } else {
        CoGroupedRDD<Object> coGroupedRDD = new CoGroupedRDD<Object>(
                (Seq<RDD<? extends Product2<Object, ?>>>) (Object) (JavaConversions
                        .asScalaBuffer(rddAfterLRA).toSeq()),
                SparkUtil.getPartitioner(glaOp.getCustomPartitioner(), parallelism),
                SparkUtil.getManifest(Object.class));
        RDD<Tuple2<IndexedKey, Seq<Seq<Tuple>>>> rdd =
                (RDD<Tuple2<IndexedKey, Seq<Seq<Tuple>>>>) (Object) coGroupedRDD;
        return rdd.toJavaRDD().map(new GroupPkgFunction(pkgOp)).rdd();
    }
}
Code example source: com.couchbase.client/spark-connector
@SuppressWarnings({"unchecked"})
public JavaRDD<SubdocMutationResult> couchbaseSubdocMutate(List<SubdocMutationSpec> specs, String bucket) {
    return new RDDFunctions<T>(source.rdd()).couchbaseSubdocMutate(
            SparkUtil.listToSeq(specs),
            bucket,
            scala.Option.<Duration>apply(null),
            LCLIdentity.INSTANCE
    ).toJavaRDD();
}
Code example source: com.cerner.bunsen/bunsen-core
/**
 * Returns an RDD of bundles loaded from the given path.
 *
 * @param spark the spark session
 * @param path a path to a directory of FHIR Bundles
 * @param minPartitions a suggested value for the minimal number of partitions
 * @return an RDD of FHIR Bundles
 */
public JavaRDD<BundleContainer> loadFromDirectory(SparkSession spark,
        String path,
        int minPartitions) {
    return spark.sparkContext()
            .wholeTextFiles(path, minPartitions)
            .toJavaRDD()
            .map(new ToBundle(fhirVersion));
}
Code example source: ddf-project/DDF
@SuppressWarnings("unchecked")
@Override
public Summary[] getSummaryImpl() throws DDFException {
    RDD<Object[]> rdd = (RDD<Object[]>) this.getDDF().getRepresentationHandler().get(RDD.class, Object[].class);
    JavaRDD<Object[]> data = rdd.toJavaRDD();
    Summary[] stats = data.map(new GetSummaryMapper()).reduce(new GetSummaryReducer());
    return stats;
}
Code example source: Stratio/deep-spark
@Test
public void createS3RDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    Configuration hadoopConf = mock(Configuration.class);
    when(sparkContext.hadoopConfiguration()).thenReturn(hadoopConf);
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<String> rdd = mock(RDD.class);
    JavaRDD<String> javaRDD = mock(JavaRDD.class);
    when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
    doReturn(javaRDD).when(deepSparkContextSpy).textFile(anyString());
    when(rdd.toJavaRDD()).thenReturn(javaRDD);
    when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);
    ExtractorConfig<Cells> config = createS3DeepJobConfig();
    deepSparkContextSpy.createS3RDD(config);
    verify(hadoopConf, times(1)).set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
    verify(hadoopConf, times(1)).set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));
    verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
    verify(javaRDD, times(1)).map(any(Function.class));
}
Code example source: org.apache.pig/pig
RDD<Tuple2<PartitionIndexedKey, Tuple>> skewIdxKeyRDD = rdd1.map(skewFun,
        SparkUtil.<PartitionIndexedKey, Tuple>getTuple2Manifest());
JavaRDD<Tuple2<PartitionIndexedKey, Tuple>> streamIdxKeyJavaRDD = rdd2.toJavaRDD().flatMap(streamFun);
        streamIdxKeyJavaRDD.rdd(), SparkUtil.getManifest(PartitionIndexedKey.class),
        SparkUtil.getManifest(Tuple.class));
return result.rdd();
Code example source: org.apache.pig/pig
private JavaRDD<Tuple2<IndexedKey, Tuple>> handleSecondarySort(
        RDD<Tuple> rdd, POGlobalRearrangeSpark op, int parallelism) {
    RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyNullValueFunction(),
            SparkUtil.<Tuple, Object>getTuple2Manifest());
    JavaPairRDD<Tuple, Object> pairRDD = new JavaPairRDD<Tuple, Object>(rddPair,
            SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class));
    // first sort the tuples by secondary key when useSecondaryKey sort is enabled
    JavaPairRDD<Tuple, Object> sorted = pairRDD.repartitionAndSortWithinPartitions(
            new HashPartitioner(parallelism),
            new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
    JavaRDD<Tuple> jrdd = sorted.keys();
    JavaRDD<Tuple2<IndexedKey, Tuple>> jrddPair = jrdd.map(new ToKeyValueFunction(op));
    return jrddPair;
}
Code example source: Impetus/Kundera
ClassTag tag = scala.reflect.ClassTag$.MODULE$.apply(m.getEntityClazz());
JavaRDD javaRDD = sparkClient.sparkContext.parallelize(s, 1, tag).toJavaRDD();
        .mapToPair(new PairFunction<Object, Object, BSONObject>()
outputConfig.set(
        "mongo.output.uri",
        buildMongoURIPath(sc.getConf().get("hostname"), sc.getConf().get("portname"), m.getSchema(),
                m.getTableName()));