[SPARK-47224][PS][TESTS] Split `test_split_apply_basic` and `test_split_apply_adv`

### What changes were proposed in this pull request?
Split `test_split_apply_basic`/`test_split_apply_adv` and their parity tests.

### Why are the changes needed?
These tests are still slow; splitting them improves testing parallelism.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes apache#45332 from zhengruifeng/ps_test_split_apply_basic.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
rohan-flutterint · Feb 29, 2024 · 944a00d · 944a00d
1 parent b0a027c
commit 944a00d
Show file tree

Hide file tree

Showing 13 changed files with 389 additions and 21 deletions.
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
@@ -904,9 +904,13 @@ def __hash__(self):
         "pyspark.pandas.tests.groupby.test_rank",
         "pyspark.pandas.tests.groupby.test_size",
         "pyspark.pandas.tests.groupby.test_split_apply",
-        "pyspark.pandas.tests.groupby.test_split_apply_adv",
-        "pyspark.pandas.tests.groupby.test_split_apply_basic",
+        "pyspark.pandas.tests.groupby.test_split_apply_count",
+        "pyspark.pandas.tests.groupby.test_split_apply_first",
+        "pyspark.pandas.tests.groupby.test_split_apply_last",
         "pyspark.pandas.tests.groupby.test_split_apply_min_max",
+        "pyspark.pandas.tests.groupby.test_split_apply_skew",
+        "pyspark.pandas.tests.groupby.test_split_apply_std",
+        "pyspark.pandas.tests.groupby.test_split_apply_var",
         "pyspark.pandas.tests.groupby.test_stat",
         "pyspark.pandas.tests.groupby.test_stat_adv",
         "pyspark.pandas.tests.groupby.test_stat_ddof",
@@ -1180,9 +1184,13 @@ def __hash__(self):
         "pyspark.pandas.tests.connect.groupby.test_parity_cumulative",
         "pyspark.pandas.tests.connect.groupby.test_parity_missing_data",
         "pyspark.pandas.tests.connect.groupby.test_parity_split_apply",
-        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_adv",
-        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_basic",
+        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_count",
+        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_first",
+        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_last",
         "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_min_max",
+        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_skew",
+        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_std",
+        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_var",
         "pyspark.pandas.tests.connect.series.test_parity_datetime",
         "pyspark.pandas.tests.connect.series.test_parity_string_ops_adv",
         "pyspark.pandas.tests.connect.series.test_parity_string_ops_basic",

diff --git a/.../groupby/test_parity_split_apply_basic.py → .../groupby/test_parity_split_apply_count.py b/.../groupby/test_parity_split_apply_basic.py → .../groupby/test_parity_split_apply_count.py
@@ -16,21 +16,21 @@
 #
 import unittest
 
-from pyspark.pandas.tests.groupby.test_split_apply_basic import GroupbySplitApplyBasicMixin
+from pyspark.pandas.tests.groupby.test_split_apply_count import GroupbySplitApplyCountMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
 from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class GroupbySplitApplyBasicParityTests(
-    GroupbySplitApplyBasicMixin,
+class GroupbySplitApplyCountParityTests(
+    GroupbySplitApplyCountMixin,
     PandasOnSparkTestUtils,
     ReusedConnectTestCase,
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_basic import *  # noqa
+    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_count import *  # noqa
 
     try:
         import xmlrunner  # type: ignore[import]

diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_first.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_first.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.groupby.test_split_apply_first import GroupbySplitApplyFirstMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class GroupbySplitApplyFirstParityTests(
+    GroupbySplitApplyFirstMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
+):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_first import *  # noqa
+
+    try:
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_last.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_last.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.groupby.test_split_apply_last import GroupbySplitApplyLastMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class GroupbySplitApplyLastParityTests(
+    GroupbySplitApplyLastMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
+):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_last import *  # noqa
+
+    try:
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_skew.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_skew.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.groupby.test_split_apply_skew import GroupbySplitApplySkewMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class GroupbySplitApplySkewParityTests(
+    GroupbySplitApplySkewMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
+):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_skew import *  # noqa
+
+    try:
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/...ct/groupby/test_parity_split_apply_adv.py → ...ct/groupby/test_parity_split_apply_std.py b/...ct/groupby/test_parity_split_apply_adv.py → ...ct/groupby/test_parity_split_apply_std.py
@@ -16,21 +16,21 @@
 #
 import unittest
 
-from pyspark.pandas.tests.groupby.test_split_apply_adv import GroupbySplitApplyAdvMixin
+from pyspark.pandas.tests.groupby.test_split_apply_std import GroupbySplitApplyStdMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
 from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class GroupbySplitApplyAdvParityTests(
-    GroupbySplitApplyAdvMixin,
+class GroupbySplitApplyStdParityTests(
+    GroupbySplitApplyStdMixin,
     PandasOnSparkTestUtils,
     ReusedConnectTestCase,
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_adv import *  # noqa
+    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_std import *  # noqa
 
     try:
         import xmlrunner  # type: ignore[import]

diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_var.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_var.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.groupby.test_split_apply_var import GroupbySplitApplyVarMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class GroupbySplitApplyVarParityTests(
+    GroupbySplitApplyVarMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
+):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_var import *  # noqa
+
+    try:
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/groupby/test_split_apply_count.py b/python/pyspark/pandas/tests/groupby/test_split_apply_count.py
@@ -0,0 +1,49 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin
+
+
+class GroupbySplitApplyCountMixin(GroupbySplitApplyTestingFuncMixin):
+    def test_split_apply_combine_on_series(self):
+        funcs = [
+            ((True, False), ["count"]),
+        ]
+        self._test_split_apply_func(funcs)
+
+
+class GroupbySplitApplyCountTests(
+    GroupbySplitApplyCountMixin,
+    PandasOnSparkTestCase,
+    SQLTestUtils,
+):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.groupby.test_split_apply_count import *  # noqa: F401
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/groupby/test_split_apply_first.py b/python/pyspark/pandas/tests/groupby/test_split_apply_first.py
@@ -0,0 +1,49 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin
+
+
+class GroupbySplitApplyFirstMixin(GroupbySplitApplyTestingFuncMixin):
+    def test_split_apply_combine_on_series(self):
+        funcs = [
+            ((True, False), ["first"]),
+        ]
+        self._test_split_apply_func(funcs)
+
+
+class GroupbySplitApplyFirstTests(
+    GroupbySplitApplyFirstMixin,
+    PandasOnSparkTestCase,
+    SQLTestUtils,
+):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.groupby.test_split_apply_first import *  # noqa: F401
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/...s/tests/groupby/test_split_apply_basic.py → ...as/tests/groupby/test_split_apply_last.py b/...s/tests/groupby/test_split_apply_basic.py → ...as/tests/groupby/test_split_apply_last.py
@@ -21,24 +21,24 @@
 from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin
 
 
-class GroupbySplitApplyBasicMixin(GroupbySplitApplyTestingFuncMixin):
+class GroupbySplitApplyLastMixin(GroupbySplitApplyTestingFuncMixin):
     def test_split_apply_combine_on_series(self):
         funcs = [
-            ((True, False), ["count", "first", "last"]),
+            ((True, False), ["last"]),
         ]
         self._test_split_apply_func(funcs)
 
 
 class GroupbySplitApplyBasicTests(
-    GroupbySplitApplyBasicMixin,
+    GroupbySplitApplyLastMixin,
     PandasOnSparkTestCase,
     SQLTestUtils,
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.groupby.test_split_apply_basic import *  # noqa: F401
+    from pyspark.pandas.tests.groupby.test_split_apply_last import *  # noqa: F401
 
     try:
         import xmlrunner