from pyspark.sql.functions import *
# Reproduce a skewed shuffle join: disable broadcast joins and force very few
# shuffle partitions so virtually all rows (key "x") land in one partition.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")
spark.conf.set("spark.sql.shuffle.partitions", "3")
spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes", "-1")

# Left side: ~10 billion rows, all with join key "x" (withColumn overwrites the
# range's own "id" column), plus a handful of "y" and "z" rows.
left = spark.range(10000000000).withColumn("id", lit("x"))
left = left.union(spark.range(4).withColumn("id", lit("y")))
left = left.union(spark.range(4).withColumn("id", lit("z")))

# Right side: same key distribution at a much smaller scale.
right = spark.range(1000000).withColumn("id", lit("x"))
right = right.union(spark.range(10).withColumn("id", lit("y")))
right = right.union(spark.range(10).withColumn("id", lit("z")))

# Equi-join on the skewed key and persist only the key column.
joined = left.join(right, left.id == right.id).select(left.id)
joined.write.parquet('s3a://...', mode='overwrite')
Suggested configuration (note the correct config-key casing):
spark.sql.adaptive.skewJoin.skewedPartitionFactor=2
spark.sql.adaptive.localShuffleReader.enabled=true
Skewed join fails
No answers yet!
There are no answers so far — be the first to answer!