Skip to content

Commit

Permalink
Feat/index reusable content (#69)
Browse files Browse the repository at this point in the history
* fix(Lucene.Core): upgrade to new version of Kentico, webpage events which are listened to by Lucene changed from archive to unpublish

* feat(Lucene.Core): Mapping reusable items

* feat(docs): add reusable content types

* fix(Lucene.Admin): wrong placeholder in content type selection

* fix(Lucene.Admin): reusable content provider comment

* feat(docs): reusable content types images

* feat(Lucene.Core): indexing reusable content example strategy

* feat(docs): indexing reusable content

* fix(docs): remove invalid images

* Add files via upload

* mark .jpg as binary in .gitattributes

* feat(Lucene.Core): indexing reusable content example

* feat(Lucene.Core): indexing reusable content do not execute rebuild query if no content type is selected

* feat(Lucene.Admin): update packages

* fix(docs): typo

* fix(DancingGoat): refactor string initialization with string.empty

* refactor(Lucene.Core, Admin): refactor string assignments

* refactor(Lucene.Core): remove specific info providers

* refactor(Lucene.Core): use iinfoproviders

* refactor(docs): change "" to string.Empty

* refactor(Lucene.Core): remove redundant info provider definitions

* refactor(DancingGoat): conditional blocks

---------

Co-authored-by: Miloslav Hlaváč <[email protected]>
  • Loading branch information
bkapustik and Lahvac authored Sep 3, 2024
1 parent 2e1ae4b commit 211efdb
Show file tree
Hide file tree
Showing 43 changed files with 3,976 additions and 3,428 deletions.
13 changes: 7 additions & 6 deletions docs/Custom-index-strategy.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ public class ExampleSearchIndexingStrategy : DefaultLuceneIndexingStrategy
{
var document = new Document();

string sortableTitle = "";
string title = "";
string sortableTitle = string.Empty;
string title = string.Empty;

// IIndexEventItemModel could be a reusable content item or a web page item, so we use
// pattern matching to get access to the web page item specific type and fields
Expand Down Expand Up @@ -60,7 +60,7 @@ public class ExampleSearchIndexingStrategy : DefaultLuceneIndexingStrategy

var article = page.ArticlePageArticle.FirstOrDefault();

sortableTitle = title = article?.ArticleTitle ?? "";
sortableTitle = title = article?.ArticleTitle ?? string.Empty;
}

document.Add(new TextField(nameof(GlobalSearchResultModel.Title), title, Field.Store.YES));
Expand All @@ -83,6 +83,7 @@ public static class BaseDocumentProperties
public const string ID = "ID";
public const string CONTENT_TYPE_NAME = "ContentTypeName";
// ...
// This field is only added to the document if the indexed item is a web page.
public const string URL = "Url";
}
```
Expand Down Expand Up @@ -135,9 +136,9 @@ public class ExampleSearchIndexingStrategy : DefaultLuceneIndexingStrategy
{
var document = new Document();

string sortableTitle = "";
string title = "";
string contentType = "";
string sortableTitle = string.Empty;
string title = string.Empty;
string contentType = string.Empty;

if (item is IndexEventWebPageItemModel webpageItem &&
string.Equals(indexedModel.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
Expand Down
15 changes: 9 additions & 6 deletions docs/Managing-Indexes.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,22 @@ Create a new index or select an index to edit by clicking the index row or the

Fill out the search index form, populating the fields with your custom values.

![Administration search index list](/images/xperience-administration-search-index-edit-form.jpg)
![Administration search index edit form](/images/xperience-administration-search-index-edit-form.jpg)

- Rebuild Hook - for validating a request rebuild of the search index from an external source (ex: API request)
- Indexed Languages - the index will only include content in the selected languages
- Channel Name - the index will only be triggered by web page item creation or modication in the selected website channel
- Index Name - the name of the displayed index.
- Included Reusable Content Types - these are the reusable content types that will be processed by your custom indexing strategy.
If no option is selected, no items will be processed.
- Indexed Languages - the index will only include content in the selected languages.
- Channel Name - the index will only be triggered by web page item creation or modification in the selected website channel.
- Indexing Strategy - the indexing strategy specified in code during dependency registration of custom indexing strategies.
- If you want the default strategy to appear here, register it explicitly in `IServiceCollection.AddKenticoLucene()` method
- If you want the default strategy to appear here, register it explicitly in `IServiceCollection.AddKenticoLucene()` method.
- Lucene Analyzer - the Lucene analyzer which indexes use to analyze text.
- Rebuild Hook - for validating a request to rebuild the search index from an external source (ex: API request).

Now, configure the web page paths and content types that the search index depends on by clicking the Add New Path button
or clicking an existing path in the table at the top of the index configuration form.

![Administration search index list](/images/xperience-administration-search-index-edit-form-paths-edit.jpg)
![Administration search index edit paths form](/images/xperience-administration-search-index-edit-form-paths-edit.jpg)

- Included Path - can be an exact relative path of a web page item, (ex: `/path/to/my/page`), or a wildcard path (ex: `/parent-path/%`)
- To determine a web page path, select the web page in the website channel page tree, then view the "Current URL" in the Content tab of the web page. The path will be the relative path excluding the domain
Expand Down
10 changes: 5 additions & 5 deletions docs/Scraping-web-page-content.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public class WebCrawlerService
ex,
$"Tree Path: {page.SystemFields.WebPageItemTreePath}");
}
return "";
return string.Empty;
}

public async Task<string> CrawlPage(string url)
Expand All @@ -75,7 +75,7 @@ public class WebCrawlerService
ex,
$"Url: {url}");
}
return "";
return string.Empty;
}
}
```
Expand Down Expand Up @@ -136,8 +136,8 @@ public class WebScraperHtmlSanitizer
textContent = HTMLHelper.RegexHtmlToTextWhiteSpace.Replace(textContent, " ");
textContent = textContent.Trim();

string title = doc.Head?.QuerySelector("title")?.TextContent ?? "";
string description = doc.Head?.QuerySelector("meta[name='description']")?.GetAttribute("content") ?? "";
string title = doc.Head?.QuerySelector("title")?.TextContent ?? string.Empty;
string description = doc.Head?.QuerySelector("meta[name='description']")?.GetAttribute("content") ?? string.Empty;

return string.Join(
" ",
Expand Down Expand Up @@ -190,7 +190,7 @@ public override async Task<Document?> MapToLuceneDocumentOrNull(IIndexEventItemM
// Setup same as examples in Usage-Guide.md
// ...
string content = "";
string content = string.Empty;

if (item is IndexEventWebPageItemModel webpageItem &&
string.Equals(indexedModel.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
Expand Down
8 changes: 4 additions & 4 deletions docs/Search-index-querying.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ To index all existing content, rebuild the index in Xperience's Administration w
```csharp
public class GlobalSearchResultModel
{
public string Title { get; set; } = "";
public string ContentType { get; set; } = "";
public string Url { get; set; } = "";
public string Title { get; set; } = string.Empty;
public string ContentType { get; set; } = string.Empty;
public string Url { get; set; } = string.Empty;

public static List<string> PossibleFacets { get; set; } = new List<string>
{
Expand Down Expand Up @@ -105,7 +105,7 @@ public class SearchService

return new LuceneSearchResultModel<GlobalSearchResultModel>
{
Query = searchText ?? "",
Query = searchText ?? string.Empty,
Page = page,
PageSize = pageSize,
TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,
Expand Down
86 changes: 42 additions & 44 deletions examples/DancingGoat/Search/AdvancedSearchIndexingStrategy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace DancingGoat.Search;

public class AdvancedSearchIndexingStrategy : DefaultLuceneIndexingStrategy
{
public static string SORTABLE_TITLE_FIELD_NAME = "SortableTitle";
public const string SORTABLE_TITLE_FIELD_NAME = "SortableTitle";

private readonly IWebPageQueryResultMapper webPageMapper;
private readonly IContentQueryExecutor queryExecutor;
Expand Down Expand Up @@ -43,60 +43,58 @@ WebCrawlerService webCrawler
{
var document = new Document();

string sortableTitle = "";
string title = "";
string content = "";
string sortableTitle = string.Empty;
string title = string.Empty;
string content = string.Empty;

// IIndexEventItemModel could be a reusable content item or a web page item, so we use
// pattern matching to get access to the web page item specific type and fields
if (item is IndexEventWebPageItemModel indexedPage)
if (item is not IndexEventWebPageItemModel indexedPage)
{
if (string.Equals(item.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
return null;
}

if (string.Equals(item.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
{
// The implementation of GetPage<T>() is below
var page = await GetPage<ArticlePage>(
indexedPage.ItemGuid,
indexedPage.WebsiteChannelName,
indexedPage.LanguageName,
ArticlePage.CONTENT_TYPE_NAME);

if (page is null)
{
// The implementation of GetPage<T>() is below
var page = await GetPage<ArticlePage>(
indexedPage.ItemGuid,
indexedPage.WebsiteChannelName,
indexedPage.LanguageName,
ArticlePage.CONTENT_TYPE_NAME);

if (page is null)
{
return null;
}

sortableTitle = title = page?.ArticleTitle ?? "";

string rawContent = await webCrawler.CrawlWebPage(page!);
content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
return null;
}
else if (string.Equals(item.ContentTypeName, HomePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))

sortableTitle = title = page?.ArticleTitle ?? string.Empty;

string rawContent = await webCrawler.CrawlWebPage(page!);
content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
}
else if (string.Equals(item.ContentTypeName, HomePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
{
var page = await GetPage<HomePage>(
indexedPage.ItemGuid,
indexedPage.WebsiteChannelName,
indexedPage.LanguageName,
HomePage.CONTENT_TYPE_NAME);

if (page is null)
{
var page = await GetPage<HomePage>(
indexedPage.ItemGuid,
indexedPage.WebsiteChannelName,
indexedPage.LanguageName,
HomePage.CONTENT_TYPE_NAME);

if (page is null)
{
return null;
}

if (page.HomePageBanner.IsNullOrEmpty())
{
return null;
}

sortableTitle = title = page!.HomePageBanner.First().BannerText;

string rawContent = await webCrawler.CrawlWebPage(page!);
content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
return null;
}
else

if (page.HomePageBanner.IsNullOrEmpty())
{
return null;
}

sortableTitle = title = page!.HomePageBanner.First().BannerText;

string rawContent = await webCrawler.CrawlWebPage(page!);
content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
}
else
{
Expand Down
6 changes: 3 additions & 3 deletions examples/DancingGoat/Search/DancingGoatSearchResultModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

public class DancingGoatSearchResultModel
{
public string Title { get; set; } = "";
public string ContentType { get; set; } = "";
public string Url { get; set; } = "";
public string Title { get; set; } = string.Empty;
public string ContentType { get; set; } = string.Empty;
public string Url { get; set; } = string.Empty;
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ public static IServiceCollection AddKenticoDancingGoatLuceneServices(this IServi
{
builder.RegisterStrategy<AdvancedSearchIndexingStrategy>("DancingGoatExampleStrategy");
builder.RegisterStrategy<SimpleSearchIndexingStrategy>("DancingGoatMinimalExampleStrategy");
builder.RegisterStrategy<ReusableContentItemsIndexingStrategy>(nameof(ReusableContentItemsIndexingStrategy));
builder.RegisterAnalyzer<CzechAnalyzer>("Czech analyzer");
});

Expand Down
113 changes: 113 additions & 0 deletions examples/DancingGoat/Search/ReusableContentItemsIndexingStrategy.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
using CMS.ContentEngine;
using CMS.Websites;

using DancingGoat.Models;
using DancingGoat.Search.Services;

using Kentico.Xperience.Lucene.Core;
using Kentico.Xperience.Lucene.Core.Indexing;

using Lucene.Net.Documents;
using Lucene.Net.Facet;

namespace DancingGoat.Search;

/// <summary>
/// Example indexing strategy for reusable content items.
/// When a reusable <c>Banner</c> item is processed, the strategy looks up a <c>HomePage</c>
/// that links to it through <c>HomePage.HomePageBanner</c> and indexes that page's URL,
/// banner text and crawled content.
/// </summary>
public class ReusableContentItemsIndexingStrategy : DefaultLuceneIndexingStrategy
{
    public const string SORTABLE_TITLE_FIELD_NAME = "SortableTitle";

    private readonly IWebPageQueryResultMapper webPageMapper;
    private readonly IContentQueryExecutor queryExecutor;
    private readonly IWebPageUrlRetriever urlRetriever;
    private readonly WebScraperHtmlSanitizer htmlSanitizer;
    private readonly WebCrawlerService webCrawler;

    public const string FACET_DIMENSION = "ContentType";
    public const string INDEXED_WEBSITECHANNEL_NAME = "DancingGoatPages";
    public const string CRAWLER_CONTENT_FIELD_NAME = "Content";

    public ReusableContentItemsIndexingStrategy(
        IWebPageQueryResultMapper webPageMapper,
        IContentQueryExecutor queryExecutor,
        IWebPageUrlRetriever urlRetriever,
        WebScraperHtmlSanitizer htmlSanitizer,
        WebCrawlerService webCrawler
    )
    {
        this.urlRetriever = urlRetriever;
        this.webPageMapper = webPageMapper;
        this.queryExecutor = queryExecutor;
        this.htmlSanitizer = htmlSanitizer;
        this.webCrawler = webCrawler;
    }

    /// <summary>
    /// Maps a changed reusable <c>Banner</c> item to a Lucene document describing a
    /// <c>HomePage</c> that links to it.
    /// </summary>
    /// <param name="item">The item being indexed.</param>
    /// <returns>
    /// The document to index, or <c>null</c> when the item is not a reusable <c>Banner</c>,
    /// when no linking <c>HomePage</c> (or banner on it) is found, or when the page URL
    /// cannot be retrieved.
    /// </returns>
    public override async Task<Document?> MapToLuceneDocumentOrNull(IIndexEventItemModel item)
    {
        // IIndexEventItemModel could be a reusable content item or a web page item. This strategy
        // only handles reusable content items, so we use pattern matching to get access to the
        // reusable item specific type and fields.
        if (item is not IndexEventReusableItemModel indexedItem)
        {
            return null;
        }

        if (!string.Equals(item.ContentTypeName, Banner.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
        {
            return null;
        }

        var query = new ContentItemQueryBuilder()
            .ForContentType(HomePage.CONTENT_TYPE_NAME,
                config =>
                    config
                        .WithLinkedItems(4)
                        // Because the changedItem is a reusable content item, we don't have a website channel name to use here
                        // so we use a hardcoded channel name.
                        .ForWebsite(INDEXED_WEBSITECHANNEL_NAME)
                        // Retrieves all HomePages that link to the Banner through the HomePage.HomePageBanner field
                        .Linking(nameof(HomePage.HomePageBanner), new[] { indexedItem.ItemID }))
            .InLanguage(indexedItem.LanguageName);

        var linkingPages = await queryExecutor.GetWebPageResult(query, webPageMapper.Map<HomePage>);

        // The query can legitimately return nothing (no HomePage links to this Banner in the
        // hardcoded channel/language). Skip the item instead of throwing from First().
        var associatedWebPageItem = linkingPages.FirstOrDefault();
        if (associatedWebPageItem is null)
        {
            return null;
        }

        // Without a banner there is no title to index, so skip the item rather than throw.
        var banner = associatedWebPageItem.HomePageBanner?.FirstOrDefault();
        if (banner is null)
        {
            return null;
        }

        string url;
        try
        {
            url = (await urlRetriever.Retrieve(associatedWebPageItem.SystemFields.WebPageItemTreePath,
                INDEXED_WEBSITECHANNEL_NAME, indexedItem.LanguageName)).RelativePath;
        }
        catch (Exception)
        {
            // Retrieve can throw an exception when processing a page update LuceneQueueItem
            // and the page was deleted before the update task has processed. In this case, return no item.
            return null;
        }

        string title = banner.BannerText;
        string sortableTitle = title;

        string rawContent = await webCrawler.CrawlWebPage(associatedWebPageItem);
        string content = htmlSanitizer.SanitizeHtmlDocument(rawContent);

        var document = new Document();

        // If the indexed item is a reusable content item, we need to set the url manually.
        document.Add(new StringField(BaseDocumentProperties.URL, url, Field.Store.YES));
        document.Add(new TextField(nameof(DancingGoatSearchResultModel.Title), title, Field.Store.YES));
        document.Add(new StringField(SORTABLE_TITLE_FIELD_NAME, sortableTitle, Field.Store.YES));
        document.Add(new TextField(CRAWLER_CONTENT_FIELD_NAME, content, Field.Store.NO));

        return document;
    }

    /// <summary>
    /// Builds the facet configuration for this strategy, marking the
    /// <see cref="FACET_DIMENSION"/> dimension as multi-valued.
    /// </summary>
    /// <returns>The configured <see cref="FacetsConfig"/>.</returns>
    public override FacetsConfig FacetsConfigFactory()
    {
        var facetConfig = new FacetsConfig();

        facetConfig.SetMultiValued(FACET_DIMENSION, true);

        return facetConfig;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public LuceneSearchResultModel<DancingGoatSearchResultModel> GlobalSearch(
return new LuceneSearchResultModel<DancingGoatSearchResultModel>
{
Query = searchText ?? "",
Query = searchText ?? string.Empty,
Page = page,
PageSize = pageSize,
TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public LuceneSearchResultModel<DancingGoatSearchResultModel> GlobalSearch(
return new LuceneSearchResultModel<DancingGoatSearchResultModel>
{
Query = searchText ?? "",
Query = searchText ?? string.Empty,
Page = page,
PageSize = pageSize,
TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,
Expand Down
Loading

0 comments on commit 211efdb

Please sign in to comment.