Source code for hots.utils.tools

"""HOTS preprocessing tools."""

import numpy as np
import pandas as pd



[docs]
def build_df_from_containers(
    df_indiv: pd.DataFrame, tick_field: str, host_field: str, metrics: list[str]
) -> pd.DataFrame:
    """Aggregate individual consumption into host-level time-series.

    The metric columns are summed per (tick, host) pair, then the result is
    expanded so that every host seen in ``df_indiv`` has one row for every
    tick seen in ``df_indiv``. Pairs without any individual data get ``0``
    for each metric.

    :param df_indiv: Individual-level data holding ``tick_field``,
        ``host_field`` and every column listed in ``metrics``.
    :param tick_field: Name of the time column.
    :param host_field: Name of the host column.
    :param metrics: Names of the columns to sum.
    :return: New DataFrame with columns ``tick_field``, ``host_field`` and
        the metrics, sorted by (tick, host) with a fresh ``0..n-1`` index.
    """
    # Sum the metrics of all individuals sharing the same (tick, host).
    # NOTE: groupby drops rows whose tick or host is NaN (pandas default).
    host_totals = (
        df_indiv.groupby([tick_field, host_field])[metrics].sum().reset_index()
    )

    # Cartesian product of every observed tick with every observed host.
    full_grid = pd.MultiIndex.from_product(
        [df_indiv[tick_field].unique(), df_indiv[host_field].unique()],
        names=[tick_field, host_field],
    ).to_frame(index=False)

    # Left-merge so absent pairs appear with NaN metrics, then zero-fill the
    # metric columns ONLY. A frame-wide fillna(0) would also rewrite a NaN
    # tick/host key into a fabricated "0" label.
    return (
        full_grid.merge(host_totals, on=[tick_field, host_field], how="left")
        .fillna(dict.fromkeys(metrics, 0))
        .sort_values([tick_field, host_field])
        .reset_index(drop=True)
    )




[docs]
def slice_by_time(df, col, tmin, tmax):
    """Return the rows of ``df`` whose ``col`` value lies in [tmin, tmax].

    Both bounds are inclusive. The selected rows keep their original index
    labels and order.
    """
    values = df[col]
    not_before_start = values >= tmin
    not_after_end = values <= tmax
    return df.loc[not_before_start & not_after_end]




[docs]
def check_missing_entries_df(
    df: pd.DataFrame,
    tick_field: str,
    indiv_field: str,
    host_field: str,
    metrics: list[str],
) -> pd.DataFrame:
    """Ensure all (timestamp, container) pairs exist and fill missing data.

    The frame is expanded to the full cartesian product of the timestamps
    and containers observed in ``df``. For rows created by this expansion:

    * ``host_field`` is taken from the same container's nearest earlier
      row, or from its nearest later row when no earlier one exists;
    * every column in ``metrics`` is set to ``0.0``;
    * any other column is left as NaN.

    :param df: Input data; not modified, a new DataFrame is returned.
    :param tick_field: Name of the timestamp column.
    :param indiv_field: Name of the container (individual) column.
    :param host_field: Name of the host column.
    :param metrics: Names of the metric columns to zero-fill.
    :return: Completed DataFrame sorted by (timestamp, container) with a
        fresh ``0..n-1`` index.

    NOTE(review): ``reindex`` is expected to reject duplicate
    (timestamp, container) pairs in ``df`` — confirm callers deduplicate.
    """
    # 1. Prepare unique sorted values
    all_timestamps = np.sort(df[tick_field].unique())
    all_containers = np.sort(df[indiv_field].unique())

    # 2. Build the full (timestamp, container) grid and reindex onto it;
    #    pairs absent from the input become all-NaN rows.
    full_index = pd.MultiIndex.from_product(
        [all_timestamps, all_containers],
        names=[tick_field, indiv_field],
    )
    df = df.set_index([tick_field, indiv_field]).reindex(full_index).reset_index()

    # 3. Forward then backward fill the host, BOTH within each container.
    #    Rows are in timestamp-major order, so an ungrouped bfill would copy
    #    the host of the next row, which belongs to a different container.
    hosts = df.groupby(indiv_field)[host_field].ffill()
    df[host_field] = hosts.groupby(df[indiv_field]).bfill()

    # 4. Fill metrics with zeros
    df[metrics] = df[metrics].fillna(0.0)

    # 5. Sort rows chronologically (then by container)
    return df.sort_values(by=[tick_field, indiv_field], ignore_index=True)