Source code for pyspark.pandas.indexes.timedelta

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import warnings
from typing import cast, no_type_check, Any
from functools import partial

import pandas as pd
from pandas.api.types import is_hashable  # type: ignore[attr-defined]
import numpy as np

from pyspark import pandas as ps
from pyspark._globals import _NoValue
from pyspark.pandas.indexes.base import Index
from pyspark.pandas.missing.indexes import MissingPandasLikeTimedeltaIndex
from pyspark.pandas.series import Series
from pyspark.sql import functions as F


# Base time-unit conversion factors.
HOURS_PER_DAY = 24
MINUTES_PER_HOUR = 60
SECONDS_PER_MINUTE = 60
MILLIS_PER_SECOND = 1000
MICROS_PER_MILLIS = 1000

# Derived factors, composed from the base units above.
SECONDS_PER_HOUR = SECONDS_PER_MINUTE * MINUTES_PER_HOUR
SECONDS_PER_DAY = SECONDS_PER_HOUR * HOURS_PER_DAY
MICROS_PER_SECOND = MICROS_PER_MILLIS * MILLIS_PER_SECOND


class TimedeltaIndex(Index):
    """
    Immutable ndarray-like of timedelta64 data, represented internally as int64, and
    which can be boxed to timedelta objects.

    Parameters
    ----------
    data  : array-like (1-dimensional), optional
        Optional timedelta-like data to construct index with.
    unit : str, optional
        Unit of `data` (D, h, m, s, ms, us, ns) when it holds integer/float numbers.
    freq : str or pandas offset object, optional
        One of pandas date offset strings or corresponding objects. The string
        'infer' can be passed in order to set the frequency of the index as the
        inferred frequency upon creation.
    closed : optional
        Deprecated. Passing any value other than None emits a FutureWarning.
    dtype : optional
        Data type of the index. Defaults to 'timedelta64[ns]' when `data` is a
        pandas-on-Spark Series or Index.
    copy  : bool
        Make a copy of input ndarray.
    name : object
        Name to be stored in the index. Must be hashable.

    See Also
    --------
    Index : The base pandas Index type.

    Examples
    --------
    >>> from datetime import timedelta
    >>> ps.TimedeltaIndex([timedelta(1), timedelta(microseconds=2)])
    ... # doctest: +NORMALIZE_WHITESPACE
    TimedeltaIndex(['1 days 00:00:00', '0 days 00:00:00.000002'],
    dtype='timedelta64[ns]', freq=None)

    From a Series:

    >>> s = ps.Series([timedelta(1), timedelta(microseconds=2)], index=[10, 20])
    >>> ps.TimedeltaIndex(s)
    ... # doctest: +NORMALIZE_WHITESPACE
    TimedeltaIndex(['1 days 00:00:00', '0 days 00:00:00.000002'],
    dtype='timedelta64[ns]', freq=None)

    From an Index:

    >>> idx = ps.TimedeltaIndex([timedelta(1), timedelta(microseconds=2)])
    >>> ps.TimedeltaIndex(idx)
    ... # doctest: +NORMALIZE_WHITESPACE
    TimedeltaIndex(['1 days 00:00:00', '0 days 00:00:00.000002'],
    dtype='timedelta64[ns]', freq=None)
    """

    @no_type_check
    def __new__(
        cls,
        data=None,
        unit=None,
        freq=_NoValue,
        closed=None,
        dtype=None,
        copy=False,
        name=None,
    ) -> "TimedeltaIndex":
        """
        Build a TimedeltaIndex from a pandas-on-Spark Series/Index or from raw data.

        Raises
        ------
        TypeError
            If `name` is not hashable.
        """
        if closed is not None:
            warnings.warn(
                "The 'closed' keyword in TimedeltaIndex construction is deprecated "
                "and will be removed in a future version.",
                FutureWarning,
            )
        if not is_hashable(name):
            raise TypeError("Index.name must be a hashable type")

        # Existing pandas-on-Spark objects are converted through the generic Index
        # constructor rather than being round-tripped through pandas.
        if isinstance(data, (Series, Index)):
            if dtype is None:
                dtype = "timedelta64[ns]"
            return cast(TimedeltaIndex, Index(data, dtype=dtype, copy=copy, name=name))

        kwargs = dict(
            data=data,
            dtype=dtype,
            copy=copy,
            name=name,
        )
        # Forward the optional keywords only when the caller actually supplied them.
        # Newer pandas releases treat an explicit `unit=None` / `closed=None` as
        # "keyword was passed" (deprecation warning, later a TypeError), while on
        # older releases None is the default, so omitting them is equivalent.
        if unit is not None:
            kwargs["unit"] = unit
        if closed is not None:
            kwargs["closed"] = closed
        if freq is not _NoValue:
            kwargs["freq"] = freq

        return cast(TimedeltaIndex, ps.from_pandas(pd.TimedeltaIndex(**kwargs)))

    def __getattr__(self, item: str) -> Any:
        """
        Fallback attribute lookup, consulted only after normal lookup has failed.

        Names defined on MissingPandasLikeTimedeltaIndex are resolved against this
        instance: properties are evaluated immediately, anything else is returned
        as a callable bound to `self`.

        Raises
        ------
        AttributeError
            If `item` is not defined on MissingPandasLikeTimedeltaIndex either.
        """
        if hasattr(MissingPandasLikeTimedeltaIndex, item):
            property_or_func = getattr(MissingPandasLikeTimedeltaIndex, item)
            if isinstance(property_or_func, property):
                return property_or_func.fget(self)
            else:
                return partial(property_or_func, self)

        raise AttributeError("'TimedeltaIndex' object has no attribute '{}'".format(item))

    @property
    def days(self) -> Index:
        """
        Number of days for each element.
        """

        # Delegates to the `days` attribute of whatever object the transform
        # callback receives.
        def pandas_days(x) -> np.int64:  # type: ignore[no-untyped-def]
            return x.days

        return Index(self.to_series().transform(pandas_days))

    @property
    def seconds(self) -> Index:
        """
        Number of seconds (>= 0 and less than 1 day) for each element.
        """

        @no_type_check
        def get_seconds(scol):
            hour_scol = F.date_part(F.lit("HOUR"), scol)
            minute_scol = F.date_part(F.lit("MINUTE"), scol)
            second_scol = F.date_part(F.lit("SECOND"), scol)
            # Signed time-of-day part of the interval, in seconds. The three
            # components must be combined before wrapping: adding a full day for
            # each negative component separately would over-count whenever more
            # than one of them is negative (e.g. -1h 1m 1s).
            total_scol = (
                hour_scol * SECONDS_PER_HOUR + minute_scol * SECONDS_PER_MINUTE + second_scol
            )
            # A negative remainder is wrapped once into [0, 1 day); the cast then
            # drops the sub-second fraction of a non-negative value.
            return (
                F.when(total_scol < 0, SECONDS_PER_DAY + total_scol).otherwise(total_scol)
            ).cast("int")

        return Index(self.to_series().spark.transform(get_seconds))

    @property
    def microseconds(self) -> Index:
        """
        Number of microseconds (>= 0 and less than 1 second) for each element.
        """

        @no_type_check
        def get_microseconds(scol):
            second_scol = F.date_part(F.lit("SECOND"), scol)
            # floor() rounds toward negative infinity, so this difference is the
            # sub-second fraction in [0, 1) for positive and negative values alike,
            # including values whose seconds part is >= 1 or <= -1.
            fraction_scol = second_scol - F.floor(second_scol)
            return (fraction_scol * MICROS_PER_SECOND).cast("int")

        return Index(self.to_series().spark.transform(get_microseconds))

    @no_type_check
    def all(self, *args, **kwargs) -> None:
        """
        Unsupported for this index type; always raises.

        Raises
        ------
        TypeError
            Always, naming the concrete index type.
        """
        raise TypeError("Cannot perform 'all' with this index type: %s" % type(self).__name__)