Skip to content

Metric Decomposition

import pandas as pd
import sympy as sp
import numpy as np

class ImpactExplainer:
    def __init__(self, expression_str, variables):
        self.var_names = list(variables)
        self.symbols = sp.symbols(self.var_names)
        self.expr = sp.expand(expression_str)

        # Split into individual terms (e.g., x1, x1*x2, etc.)
        self.terms = sp.Add.make_args(self.expr)

        # Pre-compile the formula for each term and its partial derivatives
        self.term_data = []
        for term in self.terms:
            self.term_data.append({
                'term': term,
                'func': sp.lambdify(self.symbols, term, "numpy"),
                # Derivatives of this specific term w.r.t each variable
                'derivs': [sp.lambdify(self.symbols, sp.diff(term, v), "numpy") for v in self.symbols]
            })

    def fit(self, X, y=None):
        base_vals = [X[v].to_numpy() for v in self.var_names]
        deltas = [X[f"{v}_delta"].to_numpy() for v in self.var_names]
        new_vals = [b + d for b, d in zip(base_vals, deltas)]

        # We will store impacts for singular variables and specific mixed terms
        impact_dict = {f"y_delta_{v}": np.zeros(len(X)) for v in self.var_names}

        for item in self.term_data:
            term = item['term']

            # 1. Calculate Total Delta for this specific term
            t_base = item['func'](*base_vals)
            t_new = item['func'](*new_vals)
            total_term_delta = t_new - t_base

            # 2. Extract Singular Contributions from this term
            term_singular_sum = np.zeros(len(X))
            for i, v in enumerate(self.var_names):
                d_val = item['derivs'][i](*base_vals)
                # Handle potential scalar returns from lambdify
                if not isinstance(d_val, np.ndarray): d_val = np.full(len(X), d_val)

                contribution = d_val * deltas[i]
                impact_dict[f"y_delta_{v}"] += contribution
                term_singular_sum += contribution

            # 3. If it's a mixed term, the "Leftover" is the specific mixed effect
            if len(term.free_symbols) > 1:
                mixed_val = total_term_delta - term_singular_sum
                impact_dict[f"y_delta_mixed_{term}"] = mixed_val

        self.impact_df = pd.DataFrame(impact_dict, index=X.index)
        # Reorder: Singular variables first, then mixed terms
        cols = [c for c in self.impact_df.columns if "mixed" not in c] + \
               [c for c in self.impact_df.columns if "mixed" in c]
        self.impact_df = self.impact_df[cols]

        return self

    def transform(self, X, y=None):
        return pd.concat([X, self.impact_df], axis=1)

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def to_sql(self, table_name="input_table"):
        """Generates a SQL query to perform the same attribution logic."""

        def sql_format(expr):
            return str(expr).replace('**', '^')

        # 1. Prepare Base CTE: Define the 'New' values
        new_val_cols = [f"({v} + {v}_delta) AS {v}_new" for v in self.var_names]

        # 2. Singular Impacts: Sum of partial derivatives across all terms
        singular_sqls = []
        for v in self.symbols:
            deriv = sp.diff(self.expr, v)
            singular_sqls.append(f"({sql_format(deriv)}) * {v}_delta AS y_delta_{v}")

        # 3. Mixed Impacts: Term-specific residue
        mixed_sqls = []
        for term in self.terms:
            if len(term.free_symbols) > 1:
                # Replace original vars with 'new' vars for the final state of the term
                term_new = term
                singular_contribution = 0
                for v in term.free_symbols:
                    term_new = term_new.subs(v, sp.Symbol(f"{v}_new"))
                    singular_contribution += sp.diff(term, v) * sp.Symbol(f"{v}_delta")

                term_mixed_expr = (term_new - term) - singular_contribution
                mixed_sqls.append(f"{sql_format(term_mixed_expr)} AS y_delta_mixed_{sql_format(term)}")

        # Assemble the SQL string
        sql = f"WITH base AS (\n  SELECT *,\n    {',\n    '.join(new_val_cols)}\n  FROM {table_name}\n)\n"
        sql += "SELECT\n  *,\n  "
        sql += ",\n  ".join(singular_sqls + mixed_sqls)
        sql += "\nFROM base;"
        return sql
explainer = ImpactExplainer("x1*x2 + x3/x1", ["x1", "x2", "x3"])

df_input = pd.DataFrame({
    'x1': [2], 'x2': [2], 'x3': [2],
    'x1_delta': [2], 'x2_delta': [1], 'x3_delta': [0]
})

analysis_df = explainer.fit_transform(df_input)
analysis_df

print(explainer.to_sql())

Since SQL doesn't have a symbolic engine like SymPy, we need to "hardcode" the logic for each term and its derivatives.

2026-02-16

Comments