Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adjust duplicate handling in xpath scraper #5100

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions pkg/scraper/mapped.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ func (s mappedConfig) postProcess(ctx context.Context, q mappedQuery, attrConfig
if attrConfig.hasSplit() {
results := attrConfig.splitString(result)
// skip cleaning when the query is used for searching or when duplicates must be kept
if q.getType() == SearchQuery {
if q.getType() == SearchQuery || attrConfig.hasDuplicate() {
return results
}
results = attrConfig.cleanResults(results)
Expand All @@ -100,7 +100,7 @@ func (s mappedConfig) postProcess(ctx context.Context, q mappedQuery, attrConfig
ret = append(ret, text)
}
// skip cleaning when the query is used for searching or when duplicates must be kept
if q.getType() == SearchQuery {
if q.getType() == SearchQuery || attrConfig.hasDuplicate() {
return ret
}
ret = attrConfig.cleanResults(ret)
Expand Down Expand Up @@ -660,6 +660,7 @@ type mappedScraperAttrConfig struct {
PostProcess []mappedPostProcessAction `yaml:"postProcess"`
Concat string `yaml:"concat"`
Split string `yaml:"split"`
Duplicate bool `yaml:"duplicate"`

postProcessActions []postProcessAction

Expand Down Expand Up @@ -743,6 +744,10 @@ func (c mappedScraperAttrConfig) hasSplit() bool {
return c.Split != ""
}

// hasDuplicate reports whether the Duplicate flag is set on this attribute
// config. When it returns true, postProcess returns the matched values as-is
// instead of passing them through cleanResults.
func (c mappedScraperAttrConfig) hasDuplicate() bool {
return c.Duplicate
}

func (c mappedScraperAttrConfig) concatenateResults(nodes []string) string {
separator := c.Concat
return strings.Join(nodes, separator)
Expand Down
33 changes: 27 additions & 6 deletions pkg/scraper/xpath_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -464,13 +464,13 @@ const sceneHTML = `
<div class="pornstarsWrapper">
Pornstars:&nbsp;
<a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
data-mxptext="Alex D" href="/pornstar/alex-d">Alex D
data-mxptext="Alex D" data-gender="male" href="/pornstar/alex-d">Alex D
</a>
, <a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
data-mxptext="Mia Malkova" href="/pornstar/mia-malkova">
data-mxptext="Mia Malkova" data-gender="female" href="/pornstar/mia-malkova">
</a>
, <a class="pstar-list-btn js-mxp" data-mxptype="Pornstar"
data-mxptext="Riley Reid" href="/pornstar/riley-reid">Riley Reid
data-mxptext="Riley Reid" data-gender="female" href="/pornstar/riley-reid">Riley Reid
</a>
<div class="tooltipTrig suggestBtn" data-title="Add a pornstar">
<a class="add-btn-small add-pornstar-btn-2">+
Expand Down Expand Up @@ -570,6 +570,10 @@ func makeSceneXPathConfig() mappedScraper {
performerConfig := make(mappedConfig)
performerConfig["Name"] = makeSimpleAttrConfig(`$performerElem/@data-mxptext`)
performerConfig["URL"] = makeSimpleAttrConfig(`$performerElem/@href`)
performerConfig["Gender"] = mappedScraperAttrConfig{
Selector: `$performerElem/@data-gender`,
Duplicate: true,
}
config.Performers.mappedConfig = performerConfig

studioConfig := make(mappedConfig)
Expand Down Expand Up @@ -636,7 +640,7 @@ func verifyMovies(t *testing.T, expectedMovieNames []string, actualMovies []*mod
}
}

func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []string, actualPerformers []*models.ScrapedPerformer) {
func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []string, expectedGenders []string, actualPerformers []*models.ScrapedPerformer) {
t.Helper()

i := 0
Expand All @@ -645,24 +649,35 @@ func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []strin
actualName := ""
expectedURL := ""
actualURL := ""
expectedGender := ""
actualGender := ""
if i < len(expectedNames) {
expectedName = expectedNames[i]
}
if i < len(expectedURLs) {
expectedURL = expectedURLs[i]
}
if i < len(expectedGenders) {
expectedGender = expectedGenders[i]
}
if i < len(actualPerformers) {
actualName = *actualPerformers[i].Name
if actualPerformers[i].URL != nil {
actualURL = *actualPerformers[i].URL
}
if actualPerformers[i].Gender != nil {
actualGender = *actualPerformers[i].Gender
}
}

if expectedName != actualName {
t.Errorf("Expected performer name %s, got %s", expectedName, actualName)
}
if expectedURL != actualURL {
t.Errorf("Expected performer URL %s, got %s", expectedName, actualName)
t.Errorf("Expected performer URL %s, got %s", expectedURL, actualURL)
}
if expectedGender != actualGender {
t.Errorf("Expected performer Gender %s, got %s", expectedGender, actualGender)
}
i++
}
Expand Down Expand Up @@ -729,7 +744,13 @@ func TestApplySceneXPathConfig(t *testing.T) {
"/pornstar/riley-reid",
}

verifyPerformers(t, expectedPerformerNames, expectedPerformerURLs, scene.Performers)
expectedPerformerGenders := []string{
"male",
"female",
"female",
}

verifyPerformers(t, expectedPerformerNames, expectedPerformerURLs, expectedPerformerGenders, scene.Performers)

const expectedStudioName = "Sis Loves Me"
const expectedStudioURL = "/channels/sis-loves-me"
Expand Down
1 change: 1 addition & 0 deletions ui/v2.5/src/docs/en/Manual/ScraperDevelopment.md
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ Replaces `2001 to 2003` with `2001-2003`.
Additionally, there are a number of fixed post-processing fields, specified at the attribute level (not in `postProcess`), that are applied after the `postProcess` operations:
* `concat`: if an xpath matches multiple elements, and `concat` is present, then all of the elements will be concatenated together
* `split`: the inverse of `concat`. Splits a string into multiple elements using the given separator. For more info and examples have a look at PR [#579](https://github.com/stashapp/stash/pull/579)
* `duplicate`: if an xpath matches multiple elements, and `duplicate` is `true`, then all of the elements will be returned without removing duplicates.

Example:
```yaml
Expand Down
Loading