Search
 
SCRIPT & CODE EXAMPLE
 
CODE EXAMPLE FOR PYTHON

pyspark split dataframe by rows

from pyspark.sql.window import Window
from pyspark.sql.functions import monotonically_increasing_id, ntile

values = [(str(i),) for i in range(100)]
df = spark.createDataFrame(values, ('value',))

def split_by_row_index(df, num_partitions=4):
    # Let's assume you don't have a row_id column that has the row order
    t = df.withColumn('_row_id', monotonically_increasing_id())
    # Using ntile() because monotonically_increasing_id is discontinuous across partitions
    t = t.withColumn('_partition', ntile(num_partitions).over(Window.orderBy(t._row_id))) 
    return [t.filter(t._partition == i+1).drop('_row_id', '_partition') for i in range(partitions)]

[i.collect() for i in split_by_row_index(df)]
Source by stackoverflow.com #
 
PREVIOUS NEXT
Tagged: #pyspark #split #dataframe #rows
ADD COMMENT
Topic
Name
3+2 =