Creating Causal Identification module #1166
Changes from 51 commits
The optional dependencies are added to the environment file:

```diff
@@ -35,6 +35,8 @@ dependencies:
   - sphinx-design
   - watermark
   - typing
+  - networkx
+  - dowhy
   # lint
   - mypy
   - pandas-stubs
```
A new module (`@@ -0,0 +1,158 @@`), imported below as `pymc_marketing.mmm.causal`:

```python
# Copyright 2025 The PyMC Labs Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Causal identification class."""

import warnings

import pandas as pd

try:
    from dowhy import CausalModel
except ImportError:

    class LazyCausalModel:
        """Lazy import of dowhy's CausalModel."""

        def __init__(self, *args, **kwargs):
            msg = (
                "To use Causal Graph functionality, please install the optional "
                "dependencies with: pip install pymc-marketing[dag]"
            )
            raise ImportError(msg)

    CausalModel = LazyCausalModel
```
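The try/except fallback used here is a common optional-dependency pattern: the `ImportError` is deferred until the class is actually instantiated, so importing the package never fails. A minimal stand-alone sketch of the same pattern (the `heavylib`/`HeavyModel` names and the install hint are hypothetical):

```python
# Optional-dependency fallback: defer the ImportError until first use.
try:
    from heavylib import HeavyModel  # hypothetical optional dependency
except ImportError:

    class _LazyHeavyModel:
        """Placeholder that fails loudly only when instantiated."""

        def __init__(self, *args, **kwargs):
            raise ImportError(
                "HeavyModel requires an optional dependency; "
                "install it with: pip install mypkg[extra]"
            )

    HeavyModel = _LazyHeavyModel
```

This keeps module import cheap for users who never touch the optional feature, while giving an actionable error message to those who do.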
```python
class CausalGraphModel:
    """Represent a causal model based on a Directed Acyclic Graph (DAG).

    Provides methods to analyze causal relationships and determine the minimal
    adjustment set for backdoor adjustment between treatment and outcome variables.

    Parameters
    ----------
    causal_model : CausalModel
        An instance of dowhy's CausalModel, representing the causal graph and its
        relationships.
    treatment : list[str]
        A list of treatment variable names.
    outcome : str
        The outcome variable name.

    References
    ----------
    .. [1] https://github.com/microsoft/dowhy
    """

    def __init__(
        self, causal_model: CausalModel, treatment: list[str] | tuple[str], outcome: str
    ) -> None:
        self.causal_model = causal_model
        self.treatment = treatment
        self.outcome = outcome

    @classmethod
    def build_graphical_model(
        cls, graph: str, treatment: list[str] | tuple[str], outcome: str
    ) -> "CausalGraphModel":
        """Create a CausalGraphModel from a string representation of a graph.

        Parameters
        ----------
        graph : str
            A string representation of the graph (e.g., a string in DOT format).
        treatment : list[str]
            A list of treatment variable names.
        outcome : str
            The outcome variable name.

        Returns
        -------
        CausalGraphModel
            An instance of CausalGraphModel constructed from the given graph string.
        """
        causal_model = CausalModel(
            data=pd.DataFrame(), graph=graph, treatment=treatment, outcome=outcome
        )
        return cls(causal_model, treatment, outcome)
```
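`build_graphical_model` takes the graph as a DOT-format string. To illustrate what such a string looks like, here is a hedged sketch that pulls the edge list out of a simple DOT digraph with a regular expression (the variable names are made up, and the regex assumes plain `a -> b;` edges with no attributes or subgraphs — real DOT parsing is handled by dowhy/networkx):

```python
import re

# A DAG in DOT format, as it would be passed to build_graphical_model.
dag = """
digraph {
    x1 -> y;
    x2 -> y;
    holiday -> x1;
    holiday -> y;
}
"""

def dot_edges(graph: str) -> list[tuple[str, str]]:
    """Extract (parent, child) pairs from a minimal DOT digraph string."""
    return re.findall(r"(\w+)\s*->\s*(\w+)", graph)

edges = dot_edges(dag)
```

In this toy DAG, `holiday` is a confounder: it affects both the channel `x1` and the outcome `y`, which is exactly the situation the backdoor machinery below is designed to detect.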
```python
    def get_backdoor_paths(self) -> list[list[str]]:
        """Find all backdoor paths between the combined treatment and outcome variables.

        Returns
        -------
        list[list[str]]
            A list of backdoor paths, where each path is represented as a list of
            variable names.

        References
        ----------
        .. [1] Pearl, J., Glymour, M., & Jewell, N. P. (2016). Causal Inference in
           Statistics: A Primer.
        """
        # Use DoWhy's internal method to get backdoor paths for all treatments combined
        return self.causal_model._graph.get_backdoor_paths(
            nodes1=self.treatment, nodes2=[self.outcome]
        )

    def get_unique_adjustment_nodes(self) -> list[str]:
        """Compute the minimal adjustment set required for backdoor adjustment across all treatments.

        Returns
        -------
        list[str]
            A list of unique adjustment variables needed to block all backdoor paths.
        """
        paths = self.get_backdoor_paths()
        # Flatten paths and exclude treatments and the outcome from the adjustment set
        adjustment_nodes = {
            node
            for path in paths
            for node in path
            if node not in self.treatment and node != self.outcome
        }
        return list(adjustment_nodes)
```
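`get_unique_adjustment_nodes` is a flatten-and-filter over the backdoor paths returned by dowhy. The core set logic can be reproduced stand-alone (the paths below are illustrative, not actual dowhy output):

```python
# Backdoor paths as dowhy would return them: each path starts at a
# treatment, ends at the outcome, and traverses possible confounders.
paths = [
    ["x1", "holiday", "y"],
    ["x2", "competitor", "market", "y"],
]
treatment = ["x1", "x2"]
outcome = "y"

# Flatten all paths and drop treatments and the outcome: what remains
# is the set of candidate adjustment variables.
adjustment_nodes = {
    node
    for path in paths
    for node in path
    if node not in treatment and node != outcome
}
```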
```python
    def compute_adjustment_sets(
        self,
        channel_columns: list[str] | tuple[str],
        control_columns: list[str] | None = None,
    ) -> list[str] | None:
        """Compute minimal adjustment sets and handle warnings."""
        channel_columns = list(channel_columns)
        if control_columns is None:
            return control_columns

        self.adjustment_set = self.get_unique_adjustment_nodes()

        common_controls = set(control_columns).intersection(self.adjustment_set)
        unique_controls = set(control_columns) - set(self.adjustment_set)

        if unique_controls:
            warnings.warn(
                f"Columns {unique_controls} are not in the adjustment set. "
                "Controls are being modified.",
                stacklevel=2,
            )

        control_columns = list(common_controls - set(channel_columns))

        self.minimal_adjustment_set = control_columns + list(channel_columns)

        for column in self.adjustment_set:
            if column not in control_columns and column not in channel_columns:
                warnings.warn(
                    f"Column {column} in adjustment set not found in data. "
                    "Not controlling for this may induce bias in treatment effect "
                    "estimates.",
                    stacklevel=2,
                )

        return control_columns
```

Review comments on lines +140 to +148:

> I am hesitant on this step because of my comment on variance reduction above. Maybe we can have an additional parameter, something like …

> Regarding variance reduction, I replied. On the other hand, we could indeed implement something like that; it should not be complicated, and I'm happy to work on it. Do we want this for the initial release? It sounds like something we could add later. I think it is well understood that "minimal" means the minimal adjustment set: only what is necessary. "Maximal" could come in a following PR, allowing the largest possible set of variables that can be adjusted for without introducing bias.

> Thanks @carlosagostini. I think we can provide a minimal set as you suggest, but add a user warning that this feature is experimental and that the minimal set does not always lead to the best model, as we could be missing opportunities to reduce the variance of the estimate. We can release this initial experimental feature and collect feedback from the users, WDYT?
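The control-column reconciliation in `compute_adjustment_sets` boils down to set intersections and differences plus warnings. A stand-alone sketch of that logic with illustrative column names:

```python
import warnings

adjustment_set = ["holiday", "price"]
control_columns = ["holiday", "weather"]
channel_columns = ["x1", "x2"]

# Controls outside the adjustment set are dropped (with a warning).
common_controls = set(control_columns).intersection(adjustment_set)
unique_controls = set(control_columns) - set(adjustment_set)
if unique_controls:
    warnings.warn(
        f"Columns {unique_controls} are not in the adjustment set.", stacklevel=2
    )

# Keep only controls that are in the adjustment set and are not channels.
kept = list(common_controls - set(channel_columns))

# Adjustment variables missing from both controls and channels also warrant
# a warning: omitting them may bias the treatment effect estimates.
missing = [c for c in adjustment_set if c not in kept and c not in channel_columns]
```

Here `weather` is dropped (not needed to block any backdoor path), `holiday` is kept, and `price` is flagged as a confounder absent from the data.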
The MMM class is wired up to the new module:

```diff
@@ -33,6 +33,7 @@
 from pymc_marketing.hsgp_kwargs import HSGPKwargs
 from pymc_marketing.mmm.base import BaseValidateMMM
+from pymc_marketing.mmm.causal import CausalGraphModel
 from pymc_marketing.mmm.components.adstock import (
     AdstockTransformation,
     adstock_from_dict,
```
```diff
@@ -115,6 +116,17 @@ def __init__(
         adstock_first: bool = Field(
             True, description="Whether to apply adstock first."
         ),
+        dag: str | None = Field(
+            None,
+            description="Optional DAG provided as a string in DOT format for causal identification.",
+        ),
+        treatment_nodes: list[str] | tuple[str] | None = Field(
+            None,
+            description="Column names of the variables of interest to identify causal effects on outcome.",
+        ),
+        outcome_node: str | None = Field(
+            None, description="Name of the outcome variable."
+        ),
     ) -> None:
         """Define the constructor method.
```

Review comments on lines +119 to +129:

> This is where I would like to discuss the API. Our MMM class is already a huge monolith of many components, and I would like us to start modularizing more, or even making this a subclass. For instance, we can keep … Thoughts @wd60622?

> I like this idea. I can work quickly on it; I will wait for William's comments 🙌🏻 I will probably have a meeting with him on Tuesday!

> If the dependencies are the issue, then I think we can get away with requiring dowhy and networkx only when the dag is specified. That would keep backward compatibility: existing models would not need the new dependencies. Would checking for these dependencies only when this functionality is used solve your concerns? @juanitorduz I think going the route of subclassing could just add more code to manage 😢

> True ... what would be your suggestion @wd60622?

> I think less code to manage is better, and users still import the same MMM class. The amount of new code is only about 20 lines; I don't see it as something crazy. What's your opinion?
```diff
@@ -151,6 +163,12 @@ def __init__(
             Number of Fourier modes to model yearly seasonality, by default None.
         adstock_first : bool, optional
             Whether to apply adstock first, by default True.
+        dag : Optional[str], optional
+            Optional DAG provided as a string in DOT format for causal modeling, by default None.
+        treatment_nodes : Optional[list[str]], optional
+            Column names of the variables of interest to identify causal effects on outcome.
+        outcome_node : Optional[str], optional
+            Name of the outcome variable, by default None.
         """
         self.control_columns = control_columns
         self.time_varying_intercept = time_varying_intercept
```
```diff
@@ -180,6 +198,37 @@ def __init__(
         )

         self.yearly_seasonality = yearly_seasonality

+        self.dag = dag
+        self.treatment_nodes = treatment_nodes
+        self.outcome_node = outcome_node
+
+        # Initialize causal graph if provided
+        if self.dag is not None and self.outcome_node is not None:
+            if self.treatment_nodes is None:
+                self.treatment_nodes = self.channel_columns
+                warnings.warn(
+                    "No treatment nodes provided, using channel columns as treatment nodes.",
+                    stacklevel=2,
+                )
+            self.causal_graphical_model = CausalGraphModel.build_graphical_model(
+                graph=self.dag,
+                treatment=self.treatment_nodes,
+                outcome=self.outcome_node,
+            )
+
+            self.control_columns = self.causal_graphical_model.compute_adjustment_sets(
+                control_columns=self.control_columns,
+                channel_columns=self.channel_columns,
+            )
+
+            if "yearly_seasonality" not in self.causal_graphical_model.adjustment_set:
+                warnings.warn(
+                    "Yearly seasonality excluded as it's not required for adjustment.",
+                    stacklevel=2,
+                )
+                self.yearly_seasonality = None

         if self.yearly_seasonality is not None:
             self.yearly_fourier = YearlyFourier(
                 n_order=self.yearly_seasonality,
```
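The constructor warns rather than errors when it falls back to the channel columns as treatment nodes. That fallback-and-warn behavior can be sketched and verified with the standard `warnings` machinery (the helper function is illustrative, not the PR's code):

```python
import warnings

def resolve_treatment_nodes(treatment_nodes, channel_columns):
    """Default to channel columns when no treatment nodes are given."""
    if treatment_nodes is None:
        warnings.warn(
            "No treatment nodes provided, using channel columns as treatment nodes.",
            stacklevel=2,
        )
        return list(channel_columns)
    return list(treatment_nodes)

# catch_warnings(record=True) collects emitted warnings instead of printing them.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    nodes = resolve_treatment_nodes(None, ["x1", "x2"])
```

`stacklevel=2` attributes the warning to the caller's line rather than to the helper itself, which is the convention the PR follows throughout.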
```diff
@@ -305,6 +354,9 @@ def create_idata_attrs(self) -> dict[str, str]:
         attrs["yearly_seasonality"] = json.dumps(self.yearly_seasonality)
         attrs["time_varying_intercept"] = json.dumps(self.time_varying_intercept)
         attrs["time_varying_media"] = json.dumps(self.time_varying_media)
+        attrs["dag"] = json.dumps(self.dag)
+        attrs["treatment_nodes"] = json.dumps(self.treatment_nodes)
+        attrs["outcome_node"] = json.dumps(self.outcome_node)

         return attrs
```
```diff
@@ -680,6 +732,9 @@ def attrs_to_init_kwargs(cls, attrs) -> dict[str, Any]:
             "time_varying_media": json.loads(attrs.get("time_varying_media", "false")),
             "validate_data": json.loads(attrs["validate_data"]),
             "sampler_config": json.loads(attrs["sampler_config"]),
+            "dag": json.loads(attrs.get("dag", "null")),
+            "treatment_nodes": json.loads(attrs.get("treatment_nodes", "null")),
+            "outcome_node": json.loads(attrs.get("outcome_node", "null")),
         }

     def _data_setter(
```
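The attrs round-trip relies on `json.dumps(None)` producing the JSON literal `"null"`, so unset causal fields survive save/load, and on `.get(..., "null")` to stay backward compatible with idata saved before this PR (where the keys do not exist at all). In miniature:

```python
import json

# Serialize: None becomes the JSON literal "null".
attrs = {"dag": json.dumps(None), "treatment_nodes": json.dumps(["x1"])}

# Deserialize: missing keys (older saved models) default to "null" -> None.
dag = json.loads(attrs.get("dag", "null"))
outcome_node = json.loads(attrs.get("outcome_node", "null"))
treatment_nodes = json.loads(attrs.get("treatment_nodes", "null"))
```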
Review comments:

> Sometimes, external regressors are not in the minimal set but still help decrease variance; see https://matheusfacure.github.io/python-causality-handbook/07-Beyond-Confounders.html#good-controls
>
> Concretely: …
>
> So I am hesitant to remove, for example, seasonality if it is not in the minimal set. WDYT?

> Based on my current understanding, this is correct under the assumption that the variable is conditionally independent of the treatment given the adjustment set and does not introduce bias. If that is the case, then including strong predictors of the outcome can reduce the residual variance in the outcome, leading to more precise estimates of the treatment effect on other nodes.
>
> The example shown with email and credit limit works because the other nodes do not create a problem, but often the DAG is more complicated, and if you do not remove such a variable you introduce bias. I think assuming that whoever uses the module will always face an easy situation where they can leave everything in would not be optimal.
>
> As you say, it reduces variance; whether or not seasonality is included, if it is not part of the minimal adjustment set the mean estimate for the nodes of interest would not change. However, keeping it on the assumption that it is independent is a strong assumption that could affect the estimation of the node of interest if that is not the case for a specific user. @juanitorduz
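The variance-reduction point in this thread can be checked with a small simulation: a covariate `x` that predicts the outcome but is independent of the treatment `t` does not change the expected treatment estimate, but including it shrinks the estimator's sampling variance. A stdlib-only sketch (all coefficients and sample sizes are made up for illustration):

```python
import random
import statistics

random.seed(0)

def estimate(n=100, include_control=True):
    """One simulated dataset; return the OLS estimate of the effect of t on y."""
    t = [random.gauss(0, 1) for _ in range(n)]
    x = [random.gauss(0, 1) for _ in range(n)]  # good control: predicts y, independent of t
    y = [2.0 * ti + 5.0 * xi + random.gauss(0, 1) for ti, xi in zip(t, x)]
    if not include_control:
        # Univariate OLS of y on t (no intercept; all variables are zero-mean).
        return sum(ti * yi for ti, yi in zip(t, y)) / sum(ti * ti for ti in t)
    # Two-regressor OLS via the 2x2 normal equations.
    stt = sum(ti * ti for ti in t)
    sxx = sum(xi * xi for xi in x)
    stx = sum(ti * xi for ti, xi in zip(t, x))
    sty = sum(ti * yi for ti, yi in zip(t, y))
    sxy = sum(xi * yi for xi, yi in zip(x, y))
    det = stt * sxx - stx * stx
    return (sxx * sty - stx * sxy) / det

with_ctrl = [estimate(include_control=True) for _ in range(300)]
without_ctrl = [estimate(include_control=False) for _ in range(300)]

# Both estimators are centered near the true effect (2.0), but the one
# that includes the good control has a much smaller sampling variance.
```

This supports both sides of the thread: the minimal set is sufficient for an unbiased mean estimate, while extra outcome predictors can still tighten it — provided they really are independent of the treatment.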