Source code for sparkcraft.utils.size_estimation

from pyspark.sql import DataFrame


[docs]def df_size_in_bytes_exact(df: DataFrame):
    """
    Calculates the exact size in memory of a DataFrame by caching it and accessing the optimized plan

    Note: BE CAREFUL WITH THIS FUNCTION BECAUSE IT WILL CACHE ALL THE DATAFRAME!!! IF YOUR DATAFRAME IS
    TOO BIG USE `estimate_df_size_in_bytes`!!

    Args:
        df: A pyspark DataFrame

    Returns:
        The exact size in bytes
    """
    df.cache().count()
    size_in_bytes = df._jdf.queryExecution().optimizedPlan().stats().sizeInBytes()
    df.unpersist(blocking=True)
    return size_in_bytes


[docs]def df_size_in_bytes_approximate(df: DataFrame, sample_perc: float = 0.05):
    """
    This method takes a sample of the input DataFrame (`sample_perc`) and applies `df_size_in_bytes_exact`
    method to it. After it calculates the exact size of the sample, it extrapolates the total size.

    Args:
        df: A PySpark DataFrame
        sample_perc: The percentage of the DataFrame to sample. By default, a 5 %

    Raises:
        ValueError: If `sample_perc` is less than or equal to 0 or if it's greater than 1.

    Returns:
        The approximate size in bytes
    """
    if sample_perc <= 0 or sample_perc > 1:
        raise ValueError("`sample_perc` must be in the interval (0, 1]")

    sample_size_in_bytes = df_size_in_bytes_exact(df.sample(sample_perc))
    return sample_size_in_bytes / sample_perc