Supported pandas API

The following table shows which pandas APIs are implemented or not implemented in pandas API on Spark. Some pandas APIs do not implement all parameters, so the third column shows the missing parameters for each API.

  • ‘Y’ in the second column means the API is implemented with all of its parameters.

  • ‘N’ means it’s not implemented yet.

  • ‘P’ means it’s partially implemented, with some parameters missing.

All APIs in the list below compute the data with distributed execution, except the ones that require local execution by design. For example, DataFrame.to_numpy() requires collecting the data to the driver side.

If there is a non-implemented pandas API or parameter you want, you can create an Apache Spark JIRA to request it or contribute it on your own.

The API list is updated based on the pandas 1.3 official API reference.

DataFrame API

API

Implemented

Missing parameters

T()

Y

abs()

Y

add()

P

axis, level, fill_value

add_prefix()

Y

add_suffix()

Y

agg()

Y

aggregate()

Y

align()

P

fill_value, method, limit, fill_axis

all()

P

skipna, level, bool_only

any()

P

skipna, level, bool_only

append()

Y

apply()

P

raw, result_type

applymap()

P

na_action

asfreq

N

asof

N

assign()

Y

astype()

P

copy, errors

at()

Y

at_time()

Y

attrs

N

axes()

Y

backfill()

Y

between_time()

P

inclusive

bfill()

Y

bool()

Y

boxplot

N

clip()

P

axis, inplace

columns()

Y

combine

N

combine_first

N

compare

N

convert_dtypes

N

copy()

Y

corr()

P

min_periods

corrwith

N

count()

P

level

cov()

P

ddof

cummax()

P

axis

cummin()

P

axis

cumprod()

P

axis

cumsum()

P

axis

describe()

P

include, exclude, datetime_is_numeric

diff()

Y

div()

P

axis, level, fill_value

divide()

P

axis, level, fill_value

dot()

Y

drop()

P

index, level, inplace, errors

drop_duplicates()

P

ignore_index

droplevel()

Y

dropna()

Y

dtypes()

Y

duplicated()

Y

empty()

Y

eq()

P

axis, level

equals()

P

axis, level

eval()

Y

ewm()

N

expanding()

Y

explode()

P

ignore_index

ffill()

Y

fillna()

P

downcast

filter()

Y

first()

Y

first_valid_index()

Y

flags

N

floordiv()

P

axis, level, fill_value

from_dict()

Y

from_records()

Y

ge()

P

axis, level

get()

Y

groupby()

Y

gt()

P

axis, level

head()

Y

hist()

Y

iat()

Y

idxmax()

P

skipna

idxmin()

P

skipna

iloc()

Y

index()

Y

infer_objects

N

info()

P

show_counts

insert()

Y

interpolate()

N

isin()

Y

isna()

Y

isnull()

Y

items()

Y

iteritems()

Y

iterrows()

Y

itertuples()

Y

join()

P

sort

keys()

Y

kurt()

P

skipna, level

kurtosis()

P

skipna, level

last()

Y

last_valid_index()

Y

le()

P

axis, level

loc()

Y

lookup

N

lt()

P

axis, level

mad()

P

skipna, level

mask()

P

inplace, axis, level, errors

max()

P

skipna, level

mean()

P

skipna, level

median()

P

skipna, level

melt()

P

col_level, ignore_index

memory_usage

N

merge()

P

sort, copy, indicator, validate

min()

P

skipna, level

mod()

P

axis, level, fill_value

mode

N

mul()

P

axis, level, fill_value

multiply()

P

axis, level, fill_value

ndim()

Y

ne()

P

axis, level

nlargest()

P

keep

notna()

Y

notnull()

Y

nsmallest()

P

keep

nunique()

Y

pad()

Y

pct_change()

P

fill_method, limit, freq

pipe()

Y

pivot()

Y

pivot_table()

Y

plot.area()

Y

plot.bar()

Y

plot.barh()

Y

plot.box()

N

plot.density()

Y

plot.hexbin()

N

plot.hist()

Y

plot.kde()

Y

plot.line()

Y

plot.pie()

Y

plot.scatter()

Y

pop()

Y

pow()

P

axis, level, fill_value

prod()

P

skipna, level

product()

P

skipna, level

quantile()

P

interpolation

query()

Y

radd()

P

axis, level, fill_value

rank()

P

axis, na_option, pct, numeric_only

rdiv()

P

axis, level, fill_value

reindex()

P

method, level, limit, tolerance

reindex_like()

P

method, limit, tolerance

rename()

Y

rename_axis()

P

copy

reorder_levels

N

replace()

P

regex, method

resample

N

reset_index()

Y

rfloordiv()

P

axis, level, fill_value

rmod()

P

axis, level, fill_value

rmul()

P

axis, level, fill_value

rolling()

Y

round()

Y

rpow()

P

axis, level, fill_value

rsub()

P

axis, level, fill_value

rtruediv()

P

axis, level, fill_value

sample()

P

weights, axis, ignore_index

select_dtypes()

Y

sem()

P

skipna

set_axis

N

set_flags

N

set_index()

P

verify_integrity

shape()

Y

shift()

P

freq, axis

size()

Y

skew()

P

skipna, level

slice_shift

N

sort_index()

P

sort_remaining, ignore_index, key

sort_values()

P

keep

sparse

N

squeeze()

Y

stack()

P

level, dropna

std()

P

skipna, level

style()

Y

sub()

P

axis, level, fill_value

subtract()

P

axis, level, fill_value

sum()

P

skipna, level

swapaxes()

Y

swaplevel()

Y

tail()

Y

take()

Y

to_clipboard()

Y

to_csv()

P

encoding, compression, quoting, line_terminator, chunksize and more. See pandas.DataFrame.to_csv and pyspark.pandas.DataFrame.to_csv for details.

to_dict()

Y

to_excel()

P

storage_options

to_feather

N

to_gbq

N

to_hdf

N

to_html()

P

encoding

to_json()

P

date_format, double_precision, force_ascii, date_unit, default_handler and more. See pandas.DataFrame.to_json and pyspark.pandas.DataFrame.to_json for details.

to_latex()

P

caption, label, position

to_markdown()

P

storage_options

to_numpy()

Y

to_parquet()

P

engine, storage_options

to_period

N

to_pickle

N

to_records()

Y

to_sql

N

to_stata

N

to_string()

Y

to_timestamp

N

to_xarray

N

to_xml

N

transform()

Y

transpose()

P

copy

truediv()

P

axis, level, fill_value

truncate()

Y

tshift

N

tz_convert

N

tz_localize

N

unstack()

P

level, fill_value

update()

P

filter_func, errors

value_counts

N

values()

Y

var()

Y

where()

P

inplace, level, errors

xs()

P

drop_level

I/O API

API

Implemented

Missing parameters

read_pickle

N

DataFrame.to_pickle

N

read_table()

Y

read_csv()

P

converters, true_values, false_values, skipinitialspace, skiprows and more. See pandas.read_csv and pyspark.pandas.read_csv for details.

DataFrame.to_csv()

Y

read_fwf

N

read_clipboard()

Y

DataFrame.to_clipboard()

Y

read_excel()

P

skiprows, na_filter, decimal, skipfooter, storage_options

DataFrame.to_excel()

P

storage_options

read_json()

P

orient, typ, dtype, convert_axes, convert_dates and more. See pandas.read_json and pyspark.pandas.read_json for details.

DataFrame.to_json()

P

date_format, double_precision, force_ascii, date_unit, default_handler and more. See pandas.DataFrame.to_json and pyspark.pandas.DataFrame.to_json for details.

read_html()

Y

DataFrame.to_html()

P

encoding

read_xml

N

DataFrame.to_xml

N

DataFrame.to_latex()

P

caption, label, position

read_hdf

N

read_feather

N

DataFrame.to_feather

N

read_parquet()

P

engine, storage_options, use_nullable_dtypes

DataFrame.to_parquet()

P

engine, storage_options

read_orc()

P

read_sas

N

read_spss

N

read_sql_table()

P

coerce_float, parse_dates, chunksize

read_sql_query()

P

coerce_float, params, parse_dates, chunksize, dtype

read_sql()

P

coerce_float, params, parse_dates, chunksize

DataFrame.to_sql

N

read_gbq

N

read_stata

N

DataFrame.to_stata

N

General Function API

API

Implemented

Missing parameters

melt()

P

col_level, ignore_index

pivot

N

pivot_table

N

crosstab

N

cut

N

qcut

N

merge()

P

copy, indicator, validate

merge_ordered

N

merge_asof()

Y

concat()

P

keys, levels, names, verify_integrity, copy

get_dummies()

Y

factorize

N

unique

N

wide_to_long

N

isna()

Y

isnull()

Y

notna()

Y

notnull()

Y

to_numeric()

P

errors, downcast

to_datetime()

P

dayfirst, yearfirst, utc, exact

date_range()

Y

bdate_range

N

period_range

N

timedelta_range()

Y

infer_freq

N

interval_range

N

eval

N

Series API

API

Implemented

Missing parameters

T()

Y

abs()

Y

add()

Y

add_prefix()

Y

add_suffix()

Y

agg()

P

axis

aggregate()

P

axis

align()

P

level, fill_value, method, limit, fill_axis

all()

P

bool_only, skipna, level

any()

P

bool_only, skipna, level

append()

Y

apply()

P

convert_dtype

argmax()

P

axis, skipna

argmin()

P

axis, skipna

argsort()

P

axis, kind, order

array

N

asfreq

N

asof()

P

subset

astype()

P

copy, errors

at()

Y

at_time()

Y

attrs

N

autocorr

N

axes()

Y

backfill()

Y

between()

Y

between_time()

P

inclusive

bfill()

Y

bool()

Y

cat()

Y

clip()

P

axis, inplace

combine

N

combine_first()

Y

compare()

P

align_axis

convert_dtypes

N

copy()

Y

corr()

Y

count()

Y

cov()

P

ddof

cummax()

P

axis

cummin()

P

axis

cumprod()

P

axis

cumsum()

P

axis

describe()

P

include, exclude, datetime_is_numeric

diff()

Y

div()

P

fill_value, level

divide()

P

fill_value, level

divmod()

P

fill_value, level

dot()

Y

drop()

P

columns, inplace, errors

drop_duplicates()

Y

droplevel()

P

axis

dropna()

Y

dt()

Y

dtype()

Y

dtypes()

Y

duplicated()

N

empty()

Y

eq()

Y

equals()

Y

ewm()

N

expanding()

Y

explode()

Y

factorize()

Y

ffill()

Y

fillna()

P

downcast

filter()

Y

first()

Y

first_valid_index()

Y

flags

N

floordiv()

P

fill_value, level

ge()

P

fill_value, level

get()

Y

groupby()

Y

gt()

P

fill_value, level

hasnans()

Y

head()

Y

hist()

Y

iat()

Y

idxmax()

P

axis

idxmin()

P

axis

iloc()

Y

index()

Y

infer_objects

N

interpolate()

N

is_monotonic()

Y

is_monotonic_decreasing()

Y

is_monotonic_increasing()

Y

is_unique()

Y

isin()

Y

isna()

Y

isnull()

Y

item()

Y

items()

Y

iteritems()

Y

keys()

Y

kurt()

P

skipna, level

kurtosis()

P

skipna, level

last()

Y

last_valid_index()

Y

le()

P

fill_value, level

loc()

Y

lt()

P

fill_value, level

mad()

P

axis, skipna, level

map()

P

na_action

mask()

P

inplace, axis, level, errors

max()

P

skipna, level

mean()

P

skipna, level

median()

P

skipna, level

memory_usage

N

min()

P

skipna, level

mod()

P

fill_value, level

mode()

Y

mul()

P

fill_value, level

multiply()

P

fill_value, level

name()

Y

nbytes

N

ndim()

Y

ne()

P

fill_value, level

nlargest()

P

keep

notna()

Y

notnull()

Y

nsmallest()

P

keep

nunique()

Y

pad()

P

downcast

pct_change()

P

fill_method, limit, freq

pipe()

Y

plot.area()

Y

plot.bar()

Y

plot.barh()

Y

plot.box()

Y

plot.density()

Y

plot.hist()

Y

plot.kde()

Y

plot.line()

Y

plot.pie()

Y

pop()

Y

pow()

P

fill_value, level

prod()

P

skipna, level

product()

P

skipna, level

quantile()

P

interpolation

radd()

P

fill_value, level

rank()

P

axis, na_option, pct, numeric_only

ravel

N

rdiv()

P

fill_value, level

rdivmod()

P

fill_value, level

reindex()

P

method, copy, level, limit, tolerance

reindex_like()

P

method, copy, limit, tolerance

rename()

P

axis

rename_axis()

P

axis, copy, inplace

reorder_levels

N

repeat()

P

axis

replace()

P

inplace, limit, regex, method

resample

N

reset_index()

Y

rfloordiv()

P

fill_value, level

rmod()

P

fill_value, level

rmul()

P

fill_value, level

rolling()

Y

round()

Y

rpow()

P

fill_value, level

rsub()

P

fill_value, level

rtruediv()

P

fill_value, level

sample()

P

weights, axis

searchsorted

N

sem()

P

skipna, level

set_axis

N

set_flags

N

shape()

Y

shift()

P

freq, axis

size()

Y

skew()

P

skipna, level

slice_shift

N

sort_index()

P

sort_remaining, ignore_index, key

sort_values()

P

axis, kind, key, ignore_index

sparse

N

squeeze()

Y

std()

P

skipna, level

str()

Y

sub()

P

fill_value, level

subtract()

P

fill_value, level

sum()

P

skipna, level

swaplevel()

Y

tail()

Y

take()

P

axis

to_clipboard()

Y

to_csv()

P

encoding, compression, quoting, line_terminator, chunksize and more. See pandas.Series.to_csv and pyspark.pandas.Series.to_csv for details.

to_dict()

Y

to_excel()

P

storage_options

to_frame()

Y

to_hdf

N

to_json()

P

date_format, double_precision, force_ascii, default_handler, storage_options

to_latex()

P

caption, label, position

to_list()

Y

to_markdown()

P

storage_options

to_numpy()

Y

to_period

N

to_pickle

N

to_sql

N

to_string()

P

min_rows

to_timestamp

N

to_xarray

N

tolist

N

transform()

Y

transpose()

Y

truediv()

P

fill_value, level

truncate()

Y

tshift

N

tz_convert

N

tz_localize

N

unique()

Y

unstack()

P

fill_value

update()

Y

value_counts()

Y

values()

Y

var()

P

skipna, level

view

N

where()

P

inplace, axis, level, errors

xs()

P

axis, drop_level

Index API

API

Implemented

Missing parameters

T()

Y

all()

Y

any()

Y

append()

Y

argmax()

P

axis

argmin()

P

axis

argsort

N

array

N

asi8()

Y

asof()

Y

asof_locs

N

astype()

P

copy

copy()

Y

delete()

Y

difference()

Y

drop()

P

errors

drop_duplicates()

P

keep

droplevel()

Y

dropna()

P

how

dtype()

Y

duplicated

N

empty()

Y

equals()

Y

factorize()

Y

fillna()

P

downcast

format

N

get_indexer

N

get_indexer_for

N

get_indexer_non_unique

N

get_level_values

N

get_loc

N

get_slice_bound

N

get_value

N

groupby

N

has_duplicates()

Y

hasnans()

Y

holds_integer

N

identical()

Y

inferred_type()

Y

insert()

Y

intersection()

P

sort

is_

N

is_all_dates()

Y

is_boolean()

Y

is_categorical()

Y

is_floating()

Y

is_integer()

Y

is_interval()

Y

is_mixed

N

is_monotonic()

Y

is_monotonic_decreasing()

Y

is_monotonic_increasing()

Y

is_numeric()

Y

is_object()

Y

is_type_compatible()

Y

is_unique()

Y

isin()

P

level

isna()

Y

isnull

N

item()

Y

join

N

map()

Y

max()

P

axis, skipna

memory_usage

N

min()

P

axis, skipna

name()

Y

names()

Y

nbytes

N

ndim()

Y

nlevels()

Y

notna()

Y

notnull()

Y

nunique()

Y

putmask

N

ravel

N

reindex

N

rename()

Y

repeat()

P

axis

searchsorted

N

set_names()

Y

set_value

N

shape()

Y

shift()

P

freq

size()

Y

slice_indexer

N

slice_locs

N

sort_values()

P

na_position, key, return_indexer

sortlevel

N

symmetric_difference()

Y

take()

P

axis, allow_fill, fill_value

to_flat_index

N

to_frame()

Y

to_list()

Y

to_native_types

N

to_numpy()

Y

to_series()

Y

tolist

N

transpose

N

union()

Y

unique()

Y

value_counts()

Y

values()

Y

view()

Y

where

N

Window API

API

Implemented

Missing parameters

Rolling.agg

N

Rolling.aggregate

N

Rolling.apply

N

Rolling.axis

N

Rolling.center

N

Rolling.closed

N

Rolling.corr

N

Rolling.count()

Y

Rolling.cov

N

Rolling.exclusions

N

Rolling.is_datetimelike

N

Rolling.kurt

N

Rolling.max()

Y

Rolling.mean()

Y

Rolling.median

N

Rolling.method

N

Rolling.min()

Y

Rolling.min_periods

N

Rolling.ndim

N

Rolling.obj

N

Rolling.on

N

Rolling.quantile

N

Rolling.sem

N

Rolling.skew

N

Rolling.std

N

Rolling.sum()

Y

Rolling.validate

N

Rolling.var

N

Rolling.win_type

N

Rolling.window

N

Expanding.agg

N

Expanding.aggregate

N

Expanding.apply

N

Expanding.axis

N

Expanding.center

N

Expanding.closed

N

Expanding.corr

N

Expanding.count()

Y

Expanding.cov

N

Expanding.exclusions

N

Expanding.is_datetimelike

N

Expanding.kurt

N

Expanding.max()

Y

Expanding.mean()

Y

Expanding.median

N

Expanding.method

N

Expanding.min()

Y

Expanding.min_periods

N

Expanding.ndim

N

Expanding.obj

N

Expanding.on

N

Expanding.quantile

N

Expanding.sem

N

Expanding.skew

N

Expanding.std

N

Expanding.sum()

Y

Expanding.validate

N

Expanding.var

N

Expanding.win_type

N

Expanding.window

N

GroupBy API

API

Implemented

Missing parameters

DataFrameGroupBy.agg()

Y

DataFrameGroupBy.aggregate()

Y

GroupBy.all()

P

skipna

GroupBy.any()

P

skipna

GroupBy.apply()

Y

GroupBy.backfill()

Y

GroupBy.bfill()

Y

boxplot

N

corr

N

corrwith

N

GroupBy.count()

Y

cov

N

GroupBy.cumcount()

Y

GroupBy.cummax()

Y

GroupBy.cummin()

Y

GroupBy.cumprod()

Y

GroupBy.cumsum()

Y

DataFrameGroupBy.describe()

P

percentiles, include, exclude, datetime_is_numeric

GroupBy.diff()

P

axis

dtypes

N

ewm

N

GroupBy.ffill()

Y

GroupBy.fillna()

P

downcast

GroupBy.filter()

Y

GroupBy.first()

P

numeric_only, min_count

GroupBy.get_group()

Y

groups

N

GroupBy.head()

Y

hist

N

GroupBy.idxmax()

P

axis

GroupBy.idxmin()

P

axis

indices

N

GroupBy.last()

P

numeric_only, min_count

mad

N

GroupBy.max()

P

numeric_only, min_count

GroupBy.mean()

P

numeric_only, engine

GroupBy.median()

Y

GroupBy.min()

Y

ndim

N

ngroup

N

ngroups

N

nth

N

GroupBy.nunique()

Y

ohlc

N

pad

N

pct_change

N

pipe

N

plot

N

prod

N

quantile

N

GroupBy.rank()

P

na_option, pct, axis

resample

N

sample

N

sem

N

GroupBy.shift()

P

freq, axis

GroupBy.size()

Y

skew

N

GroupBy.std()

P

engine

GroupBy.sum()

P

numeric_only, min_count

GroupBy.tail()

Y

take

N

GroupBy.transform()

P

engine

tshift

N

GroupBy.var()

P

engine