Usage of the org.apache.spark.rdd.RDD class, with code examples


This article collects Java code examples that use the org.apache.spark.rdd.RDD class and shows how the class is used in practice. The examples are drawn mainly from GitHub, Stack Overflow, Maven, and similar platforms, extracted from selected projects, so they should serve as useful references. Details of the RDD class:
Package: org.apache.spark.rdd
Class name: RDD

RDD overview

RDD (Resilient Distributed Dataset) is Spark's core abstraction: an immutable, partitioned collection of records that can be processed in parallel. org.apache.spark.rdd.RDD is the Scala-side class; in Java code it usually appears when converting to and from the JavaRDD wrapper via rdd() and toJavaRDD(), a pattern that runs through the examples below.
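
Before the collected examples, here is a minimal, self-contained sketch (written for this article, not taken from any of the projects below) of the round trip that nearly every example relies on: rdd() exposes the Scala RDD behind a JavaRDD, and toJavaRDD() wraps an RDD back into the Java API. Class and variable names are illustrative.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;

public class RddRoundTripSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("rdd-round-trip").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaRDD<String> javaRdd = sc.parallelize(Arrays.asList("a", "b", "c"));
      // Every JavaRDD wraps a Scala-side RDD; rdd() exposes it.
      RDD<String> scalaRdd = javaRdd.rdd();
      // toJavaRDD() wraps the RDD back into the Java-friendly API.
      JavaRDD<String> roundTrip = scalaRdd.toJavaRDD();
      System.out.println(roundTrip.count()); // prints 3
    }
  }
}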

Code examples

Code example source: apache/tinkerpop

@Override
public boolean cp(final String sourceLocation, final String targetLocation) {
  final List<String> rdds = Spark.getRDDs().stream().filter(r -> r.name().startsWith(sourceLocation)).map(RDD::name).collect(Collectors.toList());
  if (rdds.size() == 0)
    return false;
  for (final String rdd : rdds) {
    Spark.getRDD(rdd).toJavaRDD().filter(a -> true).setName(rdd.equals(sourceLocation) ? targetLocation : rdd.replace(sourceLocation, targetLocation)).cache().count();
    // TODO: this should use the original storage level
  }
  return true;
}

Code example source: org.apache.spark/spark-core

@Test
public void collectUnderlyingScalaRDD() {
 List<SomeCustomClass> data = new ArrayList<>();
 for (int i = 0; i < 100; i++) {
  data.add(new SomeCustomClass());
 }
 JavaRDD<SomeCustomClass> rdd = sc.parallelize(data);
 SomeCustomClass[] collected =
  (SomeCustomClass[]) rdd.rdd().retag(SomeCustomClass.class).collect();
 assertEquals(data.size(), collected.length);
}

Code example source: OryxProject/oryx

JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
parsedRDD.cache();
RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
trainingRatingDataRDD.cache();
MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
trainingRatingDataRDD.unpersist(false);

Code example source: Stratio/deep-spark

// Excerpt: the anonymous-class bodies and some surrounding statements are truncated in the original snippet.
assertEquals(1, inputRDDEntity.count());
List<BookEntity> books = inputRDDEntity.toJavaRDD().collect();
JavaRDD<String> words = inputRDDEntity2.toJavaRDD().flatMap(new FlatMapFunction<BookEntity, String>() {
  @Override
  public Iterable<String> call(BookEntity bookEntity) throws Exception {
JavaPairRDD<String, Long> wordCount = words.mapToPair(new PairFunction<String, String, Long>() {
  @Override
  public Tuple2<String, Long> call(String s) throws Exception {
Assert.assertEquals(WORD_COUNT_SPECTED.longValue(), ((Long) outputRDDEntity.cache().count()).longValue());

Code example source: com.cerner.bunsen/bunsen-core

/**
 * Returns a dataset of ValueSet from the content stored at the given directory.
 */
protected Dataset<T> valueSetDatasetFromDirectory(String path) {
 JavaRDD<Tuple2<String,String>> fileNamesAndContents = this.spark.sparkContext()
   .wholeTextFiles(path, 1)
   .toJavaRDD();
 return this.spark.createDataset(fileNamesAndContents
   .map(new ToValueSet(fhirVersion))
   .rdd(), valueSetEncoder);
}
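
The method above is built around SparkContext.wholeTextFiles, which yields one (file path, file content) pair per file under a directory. Here is a minimal, runnable sketch of that pattern on its own; it assumes nothing from Bunsen, and the class name, application name, and directory path are illustrative.

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;

import scala.Tuple2;

public class WholeTextFilesSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("whole-text-files-sketch")
        .master("local[*]")
        .getOrCreate();
    // One (path, content) pair per file; point the path at any local directory of text files.
    JavaRDD<Tuple2<String, String>> filesAndContents = spark.sparkContext()
        .wholeTextFiles("target/valuesets", 1)
        .toJavaRDD();
    // Keep only the contents; a real loader would parse each file into a domain object here.
    JavaRDD<String> contents = filesAndContents.map(Tuple2::_2);
    System.out.println("files read: " + contents.count());
    spark.stop();
  }
}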

Code example source: org.apache.pig/pig

@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
    POStream poStream) throws IOException {
  SparkUtil.assertPredecessorSize(predecessors, poStream, 1);
  RDD<Tuple> rdd = predecessors.get(0);
  StreamFunction streamFunction = new StreamFunction(poStream);
  return rdd.toJavaRDD().mapPartitions(streamFunction, true).rdd();
}

Code example source: edu.usc.ir/age-predictor-cli

// Excerpt: the statement preceding the first line and the anonymous Function bodies are truncated in the original snippet.
null : new AgeClassifyModelWrapper(classifyModel);
JavaRDD<String> data = spark.sparkContext().textFile(dataIn,8).toJavaRDD().cache();
JavaRDD<Row> samples = data.map(
  new Function<String, Row>() {
  public Row call(String s) throws IOException{
JavaRDD<Row> validSamples = samples.filter(
  new Function<Row, Boolean>() {
      @Override

Code example source: edu.usc.ir/age-predictor-cli

// Excerpt: the anonymous Function bodies and surrounding statements are truncated in the original snippet.
int iterations = getIterations(params);
JavaRDD<String> data = spark.sparkContext().textFile(eventDir, 24).toJavaRDD()
  .cache();
JavaRDD<Row> samples = data.map(
  new Function<String, Row>() {
  public Row call(String s) {
  }).cache();
JavaRDD<Row> validSamples = samples.filter(

Code example source: com.stratio.deep/deep-core

private RDD<Cells> createRDDFromFilePath(String filePath, TextFileDataTable textFileDataTable) {
  RDD<String> result = this.sc().textFile(filePath.toString(), 1);
  JavaRDD<Cells> resultCells = result.toJavaRDD().map(new MapSchemaFromLines(textFileDataTable));
  return resultCells.rdd();
}

Code example source: edu.usc.ir/age-predictor-cli

// Excerpt: the statement preceding the first line and the filter's Function body are truncated in the original snippet.
tokenizer, featureGenerators);
JavaRDD<String> data = spark.sparkContext().textFile(dataIn, 48).toJavaRDD()
  .cache();
JavaRDD<EventWrapper> samples = data.map(new CreateEvents(wrapper)).cache();
JavaRDD<EventWrapper> validSamples = samples.filter(

Code example source: org.apache.crunch/crunch-spark

// Excerpt: the surrounding loop and the end of the if/else block are truncated in the original snippet.
Set<Target> targets = outputTargets.get(e.getKey());
if (targets.size() > 1) {
 rdd.rdd().cache();
  if (rdd instanceof JavaRDD) {
   outRDD = ((JavaRDD) rdd)
     .map(new MapFunction(c.applyPTypeTransforms() ? ptype.getOutputMapFn() : ident, ctxt))
     .mapToPair(new OutputConverterFunction(c));
  } else {
   outRDD = ((JavaPairRDD) rdd)
     .map(new PairMapFunction(c.applyPTypeTransforms() ? ptype.getOutputMapFn() : ident, ctxt))
     .mapToPair(new OutputConverterFunction(c));

Code example source: crcsmnky/mongodb-spark-demo

// Excerpt: several surrounding statements are truncated in the original snippet.
Logger log = sc.sc().log();
log.warn("ratings = " + ratingsData.count());
log.warn("users = " + userData.count());
log.warn("movies = " + movieData.count());
JavaRDD<Rating> predictions = model.predict(usersMovies.rdd()).toJavaRDD();
  Object.class, Object.class, MongoOutputFormat.class, predictionsConfig);
sc.sc().log().info("predictionsOutput.splits() = " + predictionsOutput.splits().size());

Code example source: org.apache.pig/pig

@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POJoinGroupSpark op) throws IOException {
  SparkUtil.assertPredecessorSizeGreaterThan(predecessors,
      op, 0);
  List<POLocalRearrange> lraOps = op.getLROps();
  POGlobalRearrangeSpark glaOp = op.getGROp();
  POPackage pkgOp = op.getPkgOp();
  int parallelism = SparkPigContext.get().getParallelism(predecessors, glaOp);
  List<RDD<Tuple2<IndexedKey, Tuple>>> rddAfterLRA = new ArrayList<RDD<Tuple2<IndexedKey, Tuple>>>();
  boolean useSecondaryKey = glaOp.isUseSecondaryKey();
  for (int i = 0; i < predecessors.size(); i++) {
    RDD<Tuple> rdd = predecessors.get(i);
    rddAfterLRA.add(rdd.map(new LocalRearrangeFunction(lraOps.get(i), glaOp),
        SparkUtil.<IndexedKey, Tuple>getTuple2Manifest()));
  }
  if (rddAfterLRA.size() == 1 && useSecondaryKey) {
    return SecondaryKeySortUtil.handleSecondarySort(rddAfterLRA.get(0), pkgOp);
  } else {
    CoGroupedRDD<Object> coGroupedRDD = new CoGroupedRDD<Object>(
        (Seq<RDD<? extends Product2<Object, ?>>>) (Object) (JavaConversions
            .asScalaBuffer(rddAfterLRA).toSeq()),
        SparkUtil.getPartitioner(glaOp.getCustomPartitioner(), parallelism),
        SparkUtil.getManifest(Object.class));
    RDD<Tuple2<IndexedKey, Seq<Seq<Tuple>>>> rdd =
        (RDD<Tuple2<IndexedKey, Seq<Seq<Tuple>>>>) (Object) coGroupedRDD;
    return rdd.toJavaRDD().map(new GroupPkgFunction(pkgOp)).rdd();
  }
}

Code example source: com.couchbase.client/spark-connector

@SuppressWarnings({"unchecked"})
public JavaRDD<SubdocMutationResult> couchbaseSubdocMutate(List<SubdocMutationSpec> specs, String bucket) {
  return new RDDFunctions<T>(source.rdd()).couchbaseSubdocMutate(
    SparkUtil.listToSeq(specs),
    bucket,
    scala.Option.<Duration>apply(null),
    LCLIdentity.INSTANCE
  ).toJavaRDD();
}

Code example source: com.cerner.bunsen/bunsen-core

/**
 * Returns an RDD of bundles loaded from the given path.
 *
 * @param spark the spark session
 * @param path a path to a directory of FHIR Bundles
 * @param minPartitions a suggested value for the minimal number of partitions
 * @return an RDD of FHIR Bundles
 */
public JavaRDD<BundleContainer> loadFromDirectory(SparkSession spark,
  String path,
  int minPartitions) {
 return spark.sparkContext()
   .wholeTextFiles(path, minPartitions)
   .toJavaRDD()
   .map(new ToBundle(fhirVersion));
}

Code example source: ddf-project/DDF

@SuppressWarnings("unchecked")
@Override
public Summary[] getSummaryImpl() throws DDFException {
 RDD<Object[]> rdd = (RDD<Object[]>) this.getDDF().getRepresentationHandler().get(RDD.class, Object[].class);
 JavaRDD<Object[]> data = rdd.toJavaRDD();
 Summary[] stats = data.map(new GetSummaryMapper()).reduce(new GetSummaryReducer());
 return stats;
}

Code example source: Stratio/deep-spark

@Test
public void createS3RDDTest() throws Exception {
  deepSparkContext = createDeepSparkContext();
  Configuration hadoopConf = mock(Configuration.class);
  when(sparkContext.hadoopConfiguration()).thenReturn(hadoopConf);
  DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
  SQLContext sqlContext = mock(SQLContext.class);
  Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
  Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
  RDD<String> rdd = mock(RDD.class);
  JavaRDD<String> javaRDD = mock(JavaRDD.class);
  when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
  doReturn(javaRDD).when(deepSparkContextSpy).textFile(anyString());
  when(rdd.toJavaRDD()).thenReturn(javaRDD);
  when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);
  ExtractorConfig<Cells> config = createS3DeepJobConfig();
  deepSparkContextSpy.createS3RDD(config);
  verify(hadoopConf, times(1)).set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
  verify(hadoopConf, times(1)).set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));
  verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
  verify(javaRDD, times(1)).map(any(Function.class));
}

Code example source: org.apache.pig/pig

// Excerpt: the call that builds `result` from the trailing arguments is truncated in the original snippet.
RDD<Tuple2<PartitionIndexedKey, Tuple>> skewIdxKeyRDD = rdd1.map(skewFun,
    SparkUtil.<PartitionIndexedKey, Tuple>getTuple2Manifest());
JavaRDD<Tuple2<PartitionIndexedKey, Tuple>> streamIdxKeyJavaRDD = rdd2.toJavaRDD().flatMap(streamFun);
    streamIdxKeyJavaRDD.rdd(), SparkUtil.getManifest(PartitionIndexedKey.class),
    SparkUtil.getManifest(Tuple.class));
return result.rdd();

Code example source: org.apache.pig/pig

private JavaRDD<Tuple2<IndexedKey,Tuple>> handleSecondarySort(
    RDD<Tuple> rdd, POGlobalRearrangeSpark op, int parallelism) {
  RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyNullValueFunction(),
      SparkUtil.<Tuple, Object>getTuple2Manifest());
  JavaPairRDD<Tuple, Object> pairRDD = new JavaPairRDD<Tuple, Object>(rddPair,
      SparkUtil.getManifest(Tuple.class),
      SparkUtil.getManifest(Object.class));
  //first sort the tuple by secondary key if enable useSecondaryKey sort
  JavaPairRDD<Tuple, Object> sorted = pairRDD.repartitionAndSortWithinPartitions(
      new HashPartitioner(parallelism),
      new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
  JavaRDD<Tuple> jrdd = sorted.keys();
  JavaRDD<Tuple2<IndexedKey,Tuple>> jrddPair = jrdd.map(new ToKeyValueFunction(op));
  return jrddPair;
}
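
The repartitionAndSortWithinPartitions call above shuffles the pairs with the given partitioner and sorts the keys inside each partition with the supplied comparator, which is what the secondary-key sort relies on. Below is a minimal, self-contained sketch of just that call, using illustrative key and value types rather than Pig's classes.

import java.io.Serializable;
import java.util.Arrays;
import java.util.Comparator;

import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class SortWithinPartitionsSketch {

  // The comparator travels with the shuffle, so it must be serializable.
  static class IntComparator implements Comparator<Integer>, Serializable {
    @Override
    public int compare(Integer a, Integer b) {
      return Integer.compare(a, b);
    }
  }

  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("sort-within-partitions").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaPairRDD<Integer, String> pairs = sc.parallelizePairs(Arrays.asList(
          new Tuple2<>(3, "c"), new Tuple2<>(1, "a"), new Tuple2<>(2, "b")));
      // Partition by key hash, then sort keys within each partition (no global sort).
      JavaPairRDD<Integer, String> sorted = pairs.repartitionAndSortWithinPartitions(
          new HashPartitioner(2), new IntComparator());
      System.out.println(sorted.collect());
    }
  }
}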

Code example source: Impetus/Kundera

// Excerpt: surrounding statements and the rest of the mapToPair chain are truncated in the original snippet.
ClassTag tag = scala.reflect.ClassTag$.MODULE$.apply(m.getEntityClazz());
JavaRDD javaRDD = sparkClient.sparkContext.parallelize(s, 1, tag).toJavaRDD();
    .mapToPair(new PairFunction<Object, Object, BSONObject>()
outputConfig.set(
    "mongo.output.uri",
    buildMongoURIPath(sc.getConf().get("hostname"), sc.getConf().get("portname"), m.getSchema(),
        m.getTableName()));
