Skip to content

Commit

Permalink
[SPARK-47224][PS][TESTS] Split test_split_apply_basic and `test_spl…
Browse files Browse the repository at this point in the history
…it_apply_adv`

### What changes were proposed in this pull request?
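Split `test_split_apply_basic` and `test_split_apply_adv`, together with their parity tests, into one test module per aggregation function (`count`, `first`, `last`, `min_max`, `skew`, `std`, `var`).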

### Why are the changes needed?
These test modules are still slow; splitting them into smaller modules improves test parallelism.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
CI

### Was this patch authored or co-authored using generative AI tooling?
no

Closes apache#45332 from zhengruifeng/ps_test_split_apply_basic.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
  • Loading branch information
zhengruifeng authored and HyukjinKwon committed Feb 29, 2024
1 parent b0a027c commit 944a00d
Show file tree
Hide file tree
Showing 13 changed files with 389 additions and 21 deletions.
16 changes: 12 additions & 4 deletions dev/sparktestsupport/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -904,9 +904,13 @@ def __hash__(self):
"pyspark.pandas.tests.groupby.test_rank",
"pyspark.pandas.tests.groupby.test_size",
"pyspark.pandas.tests.groupby.test_split_apply",
"pyspark.pandas.tests.groupby.test_split_apply_adv",
"pyspark.pandas.tests.groupby.test_split_apply_basic",
"pyspark.pandas.tests.groupby.test_split_apply_count",
"pyspark.pandas.tests.groupby.test_split_apply_first",
"pyspark.pandas.tests.groupby.test_split_apply_last",
"pyspark.pandas.tests.groupby.test_split_apply_min_max",
"pyspark.pandas.tests.groupby.test_split_apply_skew",
"pyspark.pandas.tests.groupby.test_split_apply_std",
"pyspark.pandas.tests.groupby.test_split_apply_var",
"pyspark.pandas.tests.groupby.test_stat",
"pyspark.pandas.tests.groupby.test_stat_adv",
"pyspark.pandas.tests.groupby.test_stat_ddof",
Expand Down Expand Up @@ -1180,9 +1184,13 @@ def __hash__(self):
"pyspark.pandas.tests.connect.groupby.test_parity_cumulative",
"pyspark.pandas.tests.connect.groupby.test_parity_missing_data",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply_adv",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply_basic",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply_count",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply_first",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply_last",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply_min_max",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply_skew",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply_std",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply_var",
"pyspark.pandas.tests.connect.series.test_parity_datetime",
"pyspark.pandas.tests.connect.series.test_parity_string_ops_adv",
"pyspark.pandas.tests.connect.series.test_parity_string_ops_basic",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,21 @@
#
import unittest

from pyspark.pandas.tests.groupby.test_split_apply_basic import GroupbySplitApplyBasicMixin
from pyspark.pandas.tests.groupby.test_split_apply_count import GroupbySplitApplyCountMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils


class GroupbySplitApplyBasicParityTests(
GroupbySplitApplyBasicMixin,
class GroupbySplitApplyCountParityTests(
GroupbySplitApplyCountMixin,
PandasOnSparkTestUtils,
ReusedConnectTestCase,
):
pass


if __name__ == "__main__":
from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_basic import * # noqa
from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_count import * # noqa

try:
import xmlrunner # type: ignore[import]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

from pyspark.pandas.tests.groupby.test_split_apply_first import GroupbySplitApplyFirstMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils


class GroupbySplitApplyFirstParityTests(
    GroupbySplitApplyFirstMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
):
    """Parity suite that reuses the tests defined on ``GroupbySplitApplyFirstMixin``.

    The class adds no tests of its own; it only combines the mixin with
    ``PandasOnSparkTestUtils`` and the ``ReusedConnectTestCase`` base from
    ``pyspark.testing.connectutils``.
    """


if __name__ == "__main__":
    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_first import *  # noqa

    # Fall back to unittest's default runner (None) unless xmlrunner is
    # importable, in which case XML reports go to target/test-reports.
    runner = None
    try:
        import xmlrunner  # type: ignore[import]

        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep the default runner.
        pass
    unittest.main(testRunner=runner, verbosity=2)
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

from pyspark.pandas.tests.groupby.test_split_apply_last import GroupbySplitApplyLastMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils


class GroupbySplitApplyLastParityTests(
    GroupbySplitApplyLastMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
):
    """Parity suite that reuses the tests defined on ``GroupbySplitApplyLastMixin``.

    The class adds no tests of its own; it only combines the mixin with
    ``PandasOnSparkTestUtils`` and the ``ReusedConnectTestCase`` base from
    ``pyspark.testing.connectutils``.
    """


if __name__ == "__main__":
    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_last import *  # noqa

    # Fall back to unittest's default runner (None) unless xmlrunner is
    # importable, in which case XML reports go to target/test-reports.
    runner = None
    try:
        import xmlrunner  # type: ignore[import]

        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep the default runner.
        pass
    unittest.main(testRunner=runner, verbosity=2)
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

from pyspark.pandas.tests.groupby.test_split_apply_skew import GroupbySplitApplySkewMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils


class GroupbySplitApplySkewParityTests(
    GroupbySplitApplySkewMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
):
    """Parity suite that reuses the tests defined on ``GroupbySplitApplySkewMixin``.

    The class adds no tests of its own; it only combines the mixin with
    ``PandasOnSparkTestUtils`` and the ``ReusedConnectTestCase`` base from
    ``pyspark.testing.connectutils``.
    """


if __name__ == "__main__":
    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_skew import *  # noqa

    # Fall back to unittest's default runner (None) unless xmlrunner is
    # importable, in which case XML reports go to target/test-reports.
    runner = None
    try:
        import xmlrunner  # type: ignore[import]

        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep the default runner.
        pass
    unittest.main(testRunner=runner, verbosity=2)
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,21 @@
#
import unittest

from pyspark.pandas.tests.groupby.test_split_apply_adv import GroupbySplitApplyAdvMixin
from pyspark.pandas.tests.groupby.test_split_apply_std import GroupbySplitApplyStdMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils


class GroupbySplitApplyAdvParityTests(
GroupbySplitApplyAdvMixin,
class GroupbySplitApplyStdParityTests(
GroupbySplitApplyStdMixin,
PandasOnSparkTestUtils,
ReusedConnectTestCase,
):
pass


if __name__ == "__main__":
from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_adv import * # noqa
from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_std import * # noqa

try:
import xmlrunner # type: ignore[import]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

from pyspark.pandas.tests.groupby.test_split_apply_var import GroupbySplitApplyVarMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils


class GroupbySplitApplyVarParityTests(
    GroupbySplitApplyVarMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
):
    """Parity suite that reuses the tests defined on ``GroupbySplitApplyVarMixin``.

    The class adds no tests of its own; it only combines the mixin with
    ``PandasOnSparkTestUtils`` and the ``ReusedConnectTestCase`` base from
    ``pyspark.testing.connectutils``.
    """


if __name__ == "__main__":
    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_var import *  # noqa

    # Fall back to unittest's default runner (None) unless xmlrunner is
    # importable, in which case XML reports go to target/test-reports.
    runner = None
    try:
        import xmlrunner  # type: ignore[import]

        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep the default runner.
        pass
    unittest.main(testRunner=runner, verbosity=2)
49 changes: 49 additions & 0 deletions python/pyspark/pandas/tests/groupby/test_split_apply_count.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin


class GroupbySplitApplyCountMixin(GroupbySplitApplyTestingFuncMixin):
    """Test mixin that drives ``_test_split_apply_func`` with ``count`` only."""

    def test_split_apply_combine_on_series(self):
        # A single entry pairing the (True, False) tuple with the "count"
        # function name. How the tuple is interpreted is decided by
        # _test_split_apply_func, presumably provided by
        # GroupbySplitApplyTestingFuncMixin -- confirm there.
        count_only = [((True, False), ["count"])]
        self._test_split_apply_func(count_only)


class GroupbySplitApplyCountTests(
    GroupbySplitApplyCountMixin, PandasOnSparkTestCase, SQLTestUtils
):
    """Concrete test case for the tests defined on ``GroupbySplitApplyCountMixin``.

    The class adds no tests of its own; it only combines the mixin with
    ``PandasOnSparkTestCase`` and ``SQLTestUtils``.
    """


if __name__ == "__main__":
    from pyspark.pandas.tests.groupby.test_split_apply_count import *  # noqa: F401

    # Fall back to unittest's default runner (None) unless xmlrunner is
    # importable, in which case XML reports go to target/test-reports.
    runner = None
    try:
        import xmlrunner

        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep the default runner.
        pass
    unittest.main(testRunner=runner, verbosity=2)
49 changes: 49 additions & 0 deletions python/pyspark/pandas/tests/groupby/test_split_apply_first.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin


class GroupbySplitApplyFirstMixin(GroupbySplitApplyTestingFuncMixin):
    """Test mixin that drives ``_test_split_apply_func`` with ``first`` only."""

    def test_split_apply_combine_on_series(self):
        # A single entry pairing the (True, False) tuple with the "first"
        # function name. How the tuple is interpreted is decided by
        # _test_split_apply_func, presumably provided by
        # GroupbySplitApplyTestingFuncMixin -- confirm there.
        first_only = [((True, False), ["first"])]
        self._test_split_apply_func(first_only)


class GroupbySplitApplyFirstTests(
    GroupbySplitApplyFirstMixin, PandasOnSparkTestCase, SQLTestUtils
):
    """Concrete test case for the tests defined on ``GroupbySplitApplyFirstMixin``.

    The class adds no tests of its own; it only combines the mixin with
    ``PandasOnSparkTestCase`` and ``SQLTestUtils``.
    """


if __name__ == "__main__":
    from pyspark.pandas.tests.groupby.test_split_apply_first import *  # noqa: F401

    # Fall back to unittest's default runner (None) unless xmlrunner is
    # importable, in which case XML reports go to target/test-reports.
    runner = None
    try:
        import xmlrunner

        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep the default runner.
        pass
    unittest.main(testRunner=runner, verbosity=2)
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,24 @@
from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin


class GroupbySplitApplyBasicMixin(GroupbySplitApplyTestingFuncMixin):
class GroupbySplitApplyLastMixin(GroupbySplitApplyTestingFuncMixin):
def test_split_apply_combine_on_series(self):
funcs = [
((True, False), ["count", "first", "last"]),
((True, False), ["last"]),
]
self._test_split_apply_func(funcs)


class GroupbySplitApplyBasicTests(
GroupbySplitApplyBasicMixin,
GroupbySplitApplyLastMixin,
PandasOnSparkTestCase,
SQLTestUtils,
):
pass


if __name__ == "__main__":
from pyspark.pandas.tests.groupby.test_split_apply_basic import * # noqa: F401
from pyspark.pandas.tests.groupby.test_split_apply_last import * # noqa: F401

try:
import xmlrunner
Expand Down
Loading

0 comments on commit 944a00d

Please sign in to comment.