dmlc · WeichenXu123 · Dec 4, 2024
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
@@ -1174,6 +1174,10 @@ def _run_job() -> Tuple[str, str]:
                     _train_booster,  # type: ignore
                     schema="data string",
                 )
+                # TODO: For Spark Connect, call `dataframe.mapInPandas(..., barrier=True)`
+                #  instead, and remove `rdd.barrier().mapPartitions(lambda x: x)`.
+                #  Likewise, for stage scheduling, set the resource profile via
+                #  `dataframe.mapInPandas(..., profile=...)`.
                 .rdd.barrier()
                 .mapPartitions(lambda x: x)
             )
@@ -1384,6 +1388,8 @@ def _run_on_gpu(self) -> bool:
 
         use_gpu_by_params = super()._run_on_gpu()
 
+        # TODO: To support Spark Connect, we can't use any SparkContext APIs or
+        #  read any Spark configurations. Remove the usages of both here.
         if _is_local(_get_spark_session().sparkContext):
             # if it's local mode, no need to check the spark configurations
             return use_gpu_by_params