Source code for hots.utils.tools

"""HOTS preprocessing tools."""

import numpy as np
import pandas as pd


def build_df_from_containers(
    df_indiv: pd.DataFrame,
    tick_field: str,
    host_field: str,
    metrics: list[str],
) -> pd.DataFrame:
    """Aggregate individual consumption into host-level time-series.

    Ensures each host has an entry for every tick (filling missing metric
    values with zeros).

    Args:
        df_indiv: Per-individual records containing ``tick_field``,
            ``host_field`` and every column listed in ``metrics``.
        tick_field: Name of the timestamp column.
        host_field: Name of the host column.
        metrics: Names of the metric columns to sum per (tick, host).

    Returns:
        A frame with columns ``[tick_field, host_field, *metrics]``, one row
        per (tick, host) combination, sorted by tick then host, with a fresh
        ``RangeIndex``.
    """
    # Sum the metrics of everything that shares a (tick, host) pair.
    df_agg = (
        df_indiv.groupby([tick_field, host_field])[metrics].sum().reset_index()
    )

    # Cartesian product of every observed tick with every observed host.
    all_combinations = pd.MultiIndex.from_product(
        [df_indiv[tick_field].unique(), df_indiv[host_field].unique()],
        names=[tick_field, host_field],
    ).to_frame(index=False)

    df_result = all_combinations.merge(
        df_agg, on=[tick_field, host_field], how="left"
    )

    # Zero-fill the metric columns only. A frame-wide fillna(0) would also
    # rewrite NaN in the key columns, turning a missing host into a bogus
    # host "0" (and making the sort below fail on string hosts).
    df_result[metrics] = df_result[metrics].fillna(0)

    return df_result.sort_values([tick_field, host_field]).reset_index(drop=True)
def slice_by_time(df, col, tmin, tmax):
    """Return the rows of ``df`` whose ``col`` value lies in [tmin, tmax].

    Both bounds are inclusive; the original row index is preserved.
    """
    in_window = df[col].between(tmin, tmax)
    return df[in_window]
def check_missing_entries_df(
    df: pd.DataFrame,
    tick_field: str,
    indiv_field: str,
    host_field: str,
    metrics: list[str],
) -> pd.DataFrame:
    """Ensure all (timestamp, container) pairs exist and fill missing data.

    Rows are added for every (timestamp, container) combination that is
    absent from ``df``. For the added rows the host is taken from the same
    container's nearest earlier record (or nearest later one when there is
    no earlier record), and the metrics are set to zero.

    Args:
        df: Input records; must contain ``tick_field``, ``indiv_field``,
            ``host_field`` and every column listed in ``metrics``.
        tick_field: Name of the timestamp column.
        indiv_field: Name of the container column.
        host_field: Name of the host column.
        metrics: Names of the metric columns to zero-fill.

    Returns:
        A new frame, sorted by timestamp then container, with a fresh
        ``RangeIndex``. The caller's frame is not modified.
    """
    # 1. Prepare unique sorted values
    all_timestamps = np.sort(df[tick_field].unique())
    all_containers = np.sort(df[indiv_field].unique())

    # 2. Build full index and reindex
    full_index = pd.MultiIndex.from_product(
        [all_timestamps, all_containers],
        names=[tick_field, indiv_field],
    )
    df = df.set_index([tick_field, indiv_field]).reindex(full_index).reset_index()

    # 3. Forward/backward fill host (intra-container).
    # Both fills must go through the groupby: chaining a plain ``.bfill()``
    # onto the grouped ``ffill()`` would back-fill across the whole column
    # and hand a container the host of whichever container is on the next row.
    df[host_field] = df.groupby(indiv_field)[host_field].ffill()
    df[host_field] = df.groupby(indiv_field)[host_field].bfill()

    # 4. Fill metrics with zeros
    df[metrics] = df[metrics].fillna(0.0)

    # 5. Sort rows chronologically
    df.sort_values(by=[tick_field, indiv_field], ignore_index=True, inplace=True)

    return df