Feat/index reusable content (#69)

* fix(Lucene.Core): upgrade to new version of Kentico, webpage events which are listened to by Lucene changed from archive to unpublish * feat(Lucene.Core): Mapping reusable items * feat(docs): add reusable content types * fix(Lucene.Admin): wrong placeholder in content type selection * fix(Lucene.Admin): reusable content provider comment * feat(docs): reusable content types images * feat(Lucene.Core): indexing reusable content example strategy * feat(docs): indexing reusable content * fix(docs): remove invalid images * Add files via upload * mark .jpg as binary in .gitattributes * feat(Lucene.Core): indexing reusable content example * feat(Lucene.Core): indexing reusable content do not execute rebuild query if no content type is selected * feat(Lucene.Admin): update packages * fix(docs): typo * fix(DancingGoat): refactor string initialization with string.empty * refactor(Lucene.Core, Admin): refactor string assignments * refactor(Lucene.Core): remove specific info providers * refactor(Lucene.Core): use iinfoproviders * refactor(docs): change "" to string.Empty * refactor(Lucene.Core): remove redundant info provider definitions * refactor(DancingGoat): conditional blocks --------- Co-authored-by: Miloslav Hlaváč <[email protected]>
Kentico · Sep 3, 2024 · 211efdb · 211efdb
1 parent 2e1ae4b
commit 211efdb
Show file tree

Hide file tree

Showing 43 changed files with 3,976 additions and 3,428 deletions.
diff --git a/docs/Custom-index-strategy.md b/docs/Custom-index-strategy.md
@@ -31,8 +31,8 @@ public class ExampleSearchIndexingStrategy : DefaultLuceneIndexingStrategy
     {
         var document = new Document();
 
-        string sortableTitle = "";
-        string title = "";
+        string sortableTitle = string.Empty;
+        string title = string.Empty;
 
         // IIndexEventItemModel could be a reusable content item or a web page item, so we use
         // pattern matching to get access to the web page item specific type and fields
@@ -60,7 +60,7 @@ public class ExampleSearchIndexingStrategy : DefaultLuceneIndexingStrategy
 
             var article = page.ArticlePageArticle.FirstOrDefault();
 
-            sortableTitle = title = article?.ArticleTitle ?? "";
+            sortableTitle = title = article?.ArticleTitle ?? string.Empty;
         }
 
         document.Add(new TextField(nameof(GlobalSearchResultModel.Title), title, Field.Store.YES));
@@ -83,6 +83,7 @@ public static class BaseDocumentProperties
     public const string ID = "ID";
     public const string CONTENT_TYPE_NAME = "ContentTypeName";
     // ...
+    // This field is only added to the document if the indexed item is a web page.
     public const string URL = "Url";
 }
 ```
@@ -135,9 +136,9 @@ public class ExampleSearchIndexingStrategy : DefaultLuceneIndexingStrategy
     {
         var document = new Document();
 
-        string sortableTitle = "";
-        string title = "";
-        string contentType = "";
+        string sortableTitle = string.Empty;
+        string title = string.Empty;
+        string contentType = string.Empty;
 
         if (item is IndexEventWebPageItemModel webpageItem &&
             string.Equals(indexedModel.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))

diff --git a/docs/Managing-Indexes.md b/docs/Managing-Indexes.md
@@ -12,19 +12,22 @@ Create a new index or select an index to edit by clicking the index row or the
 
 Fill out the search index form, populating the fields with your custom values.
 
-![Administration search index list](/images/xperience-administration-search-index-edit-form.jpg)
+![Administration search index edit form](/images/xperience-administration-search-index-edit-form.jpg)
 
-- Rebuild Hook - for validating a request rebuild of the search index from an external source (ex: API request)
-- Indexed Languages - the index will only include content in the selected languages
-- Channel Name - the index will only be triggered by web page item creation or modication in the selected website channel
+- Index Name - the name of the displayed index.
+- Included Reusable Content Types - these are the reusable content types that will be processed by your custom indexing strategy.
+If no option is selected, no items will be processed.
+- Indexed Languages - the index will only include content in the selected languages.
+- Channel Name - the index will only be triggered by web page item creation or modification in the selected website channel.
 - Indexing Strategy - the indexing strategy specified in code during dependency registration of custom indexing strategies.
-  - If you want the default strategy to appear here, register it explicitly in `IServiceCollection.AddKenticoLucene()` method
+  - If you want the default strategy to appear here, register it explicitly in `IServiceCollection.AddKenticoLucene()` method.
 - Lucene Analyzer - the Lucene analyzer which indexes use to analyze text.
+- Rebuild Hook - for validating a request rebuild of the search index from an external source (ex: API request).
 
 Now, configure the web page paths and content types that the search index depends on by clicking the Add New Path button
 or clicking an existing path in the table at the top of the index configuration form.
 
-![Administration search index list](/images/xperience-administration-search-index-edit-form-paths-edit.jpg)
+![Administration search index edit paths form](/images/xperience-administration-search-index-edit-form-paths-edit.jpg)
 
 - Included Path - can be an exact relative path of a web page item (ex: `/path/to/my/page`), or a wildcard path (ex: `/parent-path/%`)
   - To determine a web page path, select the web page in the website channel page tree, then view the "Current URL" in the Content tab of the web page. The path will be the relative path excluding the domain.

diff --git a/docs/Scraping-web-page-content.md b/docs/Scraping-web-page-content.md
@@ -57,7 +57,7 @@ public class WebCrawlerService
                 ex,
                 $"Tree Path: {page.SystemFields.WebPageItemTreePath}");
         }
-        return "";
+        return string.Empty;
     }
 
     public async Task<string> CrawlPage(string url)
@@ -75,7 +75,7 @@ public class WebCrawlerService
                 ex,
                 $"Url: {url}");
         }
-        return "";
+        return string.Empty;
     }
 }
 ```
@@ -136,8 +136,8 @@ public class WebScraperHtmlSanitizer
         textContent = HTMLHelper.RegexHtmlToTextWhiteSpace.Replace(textContent, " ");
         textContent = textContent.Trim();
 
-        string title = doc.Head?.QuerySelector("title")?.TextContent ?? "";
-        string description = doc.Head?.QuerySelector("meta[name='description']")?.GetAttribute("content") ?? "";
+        string title = doc.Head?.QuerySelector("title")?.TextContent ?? string.Empty;
+        string description = doc.Head?.QuerySelector("meta[name='description']")?.GetAttribute("content") ?? string.Empty;
 
         return string.Join(
             " ",
@@ -190,7 +190,7 @@ public override async Task<Document?> MapToLuceneDocumentOrNull(IIndexEventItemM
     // Setup same as examples in Usage-Guide.md
     // ...
 
-    string content = "";
+    string content = string.Empty;
 
     if (item is IndexEventWebPageItemModel webpageItem &&
         string.Equals(indexedModel.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))

diff --git a/docs/Search-index-querying.md b/docs/Search-index-querying.md
@@ -11,9 +11,9 @@ To index all existing content, rebuild the index in Xperience's Administration w
 ```csharp
 public class GlobalSearchResultModel
 {
-    public string Title { get; set; } = "";
-    public string ContentType { get; set; } = "";
-    public string Url { get; set; } = "";
+    public string Title { get; set; } = string.Empty;
+    public string ContentType { get; set; } = string.Empty;
+    public string Url { get; set; } = string.Empty;
 
     public static List<string> PossibleFacets { get; set; } = new List<string>
     {
@@ -105,7 +105,7 @@ public class SearchService
 
                 return new LuceneSearchResultModel<GlobalSearchResultModel>
                 {
-                    Query = searchText ?? "",
+                    Query = searchText ?? string.Empty,
                     Page = page,
                     PageSize = pageSize,
                     TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,

diff --git a/examples/DancingGoat/Search/AdvancedSearchIndexingStrategy.cs b/examples/DancingGoat/Search/AdvancedSearchIndexingStrategy.cs
@@ -15,7 +15,7 @@ namespace DancingGoat.Search;
 
 public class AdvancedSearchIndexingStrategy : DefaultLuceneIndexingStrategy
 {
-    public static string SORTABLE_TITLE_FIELD_NAME = "SortableTitle";
+    public const string SORTABLE_TITLE_FIELD_NAME = "SortableTitle";
 
     private readonly IWebPageQueryResultMapper webPageMapper;
     private readonly IContentQueryExecutor queryExecutor;
@@ -43,60 +43,58 @@ WebCrawlerService webCrawler
     {
         var document = new Document();
 
-        string sortableTitle = "";
-        string title = "";
-        string content = "";
+        string sortableTitle = string.Empty;
+        string title = string.Empty;
+        string content = string.Empty;
 
         // IIndexEventItemModel could be a reusable content item or a web page item, so we use
         // pattern matching to get access to the web page item specific type and fields
-        if (item is IndexEventWebPageItemModel indexedPage)
+        if (item is not IndexEventWebPageItemModel indexedPage)
         {
-            if (string.Equals(item.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
+            return null;
+        }
+
+        if (string.Equals(item.ContentTypeName, ArticlePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
+        {
+            // The implementation of GetPage<T>() is below
+            var page = await GetPage<ArticlePage>(
+                indexedPage.ItemGuid,
+                indexedPage.WebsiteChannelName,
+                indexedPage.LanguageName,
+                ArticlePage.CONTENT_TYPE_NAME);
+
+            if (page is null)
             {
-                // The implementation of GetPage<T>() is below
-                var page = await GetPage<ArticlePage>(
-                    indexedPage.ItemGuid,
-                    indexedPage.WebsiteChannelName,
-                    indexedPage.LanguageName,
-                    ArticlePage.CONTENT_TYPE_NAME);
-
-                if (page is null)
-                {
-                    return null;
-                }
-
-                sortableTitle = title = page?.ArticleTitle ?? "";
-
-                string rawContent = await webCrawler.CrawlWebPage(page!);
-                content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
+                return null;
             }
-            else if (string.Equals(item.ContentTypeName, HomePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
+
+            sortableTitle = title = page?.ArticleTitle ?? string.Empty;
+
+            string rawContent = await webCrawler.CrawlWebPage(page!);
+            content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
+        }
+        else if (string.Equals(item.ContentTypeName, HomePage.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
+        {
+            var page = await GetPage<HomePage>(
+                indexedPage.ItemGuid,
+                indexedPage.WebsiteChannelName,
+                indexedPage.LanguageName,
+                HomePage.CONTENT_TYPE_NAME);
+
+            if (page is null)
             {
-                var page = await GetPage<HomePage>(
-                    indexedPage.ItemGuid,
-                    indexedPage.WebsiteChannelName,
-                    indexedPage.LanguageName,
-                    HomePage.CONTENT_TYPE_NAME);
-
-                if (page is null)
-                {
-                    return null;
-                }
-
-                if (page.HomePageBanner.IsNullOrEmpty())
-                {
-                    return null;
-                }
-
-                sortableTitle = title = page!.HomePageBanner.First().BannerText;
-
-                string rawContent = await webCrawler.CrawlWebPage(page!);
-                content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
+                return null;
             }
-            else
+
+            if (page.HomePageBanner.IsNullOrEmpty())
             {
                 return null;
             }
+
+            sortableTitle = title = page!.HomePageBanner.First().BannerText;
+
+            string rawContent = await webCrawler.CrawlWebPage(page!);
+            content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
         }
         else
         {

diff --git a/examples/DancingGoat/Search/DancingGoatSearchResultModel.cs b/examples/DancingGoat/Search/DancingGoatSearchResultModel.cs
@@ -2,7 +2,7 @@
 
 public class DancingGoatSearchResultModel
 {
-    public string Title { get; set; } = "";
-    public string ContentType { get; set; } = "";
-    public string Url { get; set; } = "";
+    public string Title { get; set; } = string.Empty;
+    public string ContentType { get; set; } = string.Empty;
+    public string Url { get; set; } = string.Empty;
 }
diff --git a/examples/DancingGoat/Search/DancingGoatSearchStartupExtensions.cs b/examples/DancingGoat/Search/DancingGoatSearchStartupExtensions.cs
@@ -12,6 +12,7 @@ public static IServiceCollection AddKenticoDancingGoatLuceneServices(this IServi
         {
             builder.RegisterStrategy<AdvancedSearchIndexingStrategy>("DancingGoatExampleStrategy");
             builder.RegisterStrategy<SimpleSearchIndexingStrategy>("DancingGoatMinimalExampleStrategy");
+            builder.RegisterStrategy<ReusableContentItemsIndexingStrategy>(nameof(ReusableContentItemsIndexingStrategy));
             builder.RegisterAnalyzer<CzechAnalyzer>("Czech analyzer");
         });
 

diff --git a/examples/DancingGoat/Search/ReusableContentItemsIndexingStrategy.cs b/examples/DancingGoat/Search/ReusableContentItemsIndexingStrategy.cs
@@ -0,0 +1,113 @@
+using CMS.ContentEngine;
+using CMS.Websites;
+
+using DancingGoat.Models;
+using DancingGoat.Search.Services;
+
+using Kentico.Xperience.Lucene.Core;
+using Kentico.Xperience.Lucene.Core.Indexing;
+
+using Lucene.Net.Documents;
+using Lucene.Net.Facet;
+
+namespace DancingGoat.Search;
+
+/// <summary>Example strategy that indexes reusable Banner items through a home page that links to them.</summary>
+public class ReusableContentItemsIndexingStrategy : DefaultLuceneIndexingStrategy
+{
+    public const string SORTABLE_TITLE_FIELD_NAME = "SortableTitle";
+
+    private readonly IWebPageQueryResultMapper webPageMapper;
+    private readonly IContentQueryExecutor queryExecutor;
+    private readonly IWebPageUrlRetriever urlRetriever;
+    private readonly WebScraperHtmlSanitizer htmlSanitizer;
+    private readonly WebCrawlerService webCrawler;
+
+    public const string FACET_DIMENSION = "ContentType";
+    public const string INDEXED_WEBSITECHANNEL_NAME = "DancingGoatPages";
+    public const string CRAWLER_CONTENT_FIELD_NAME = "Content";
+
+    public ReusableContentItemsIndexingStrategy(
+        IWebPageQueryResultMapper webPageMapper,
+        IContentQueryExecutor queryExecutor,
+        IWebPageUrlRetriever urlRetriever,
+        WebScraperHtmlSanitizer htmlSanitizer,
+        WebCrawlerService webCrawler
+    )
+    {
+        this.urlRetriever = urlRetriever;
+        this.webPageMapper = webPageMapper;
+        this.queryExecutor = queryExecutor;
+        this.htmlSanitizer = htmlSanitizer;
+        this.webCrawler = webCrawler;
+    }
+
+    /// <summary>
+    /// Maps a changed reusable Banner to a Lucene document describing a home page that links to it. Returns null when
+    /// the item is not a reusable Banner, no linking home page or banner is found, or the page URL cannot be retrieved.
+    /// </summary>
+    public override async Task<Document?> MapToLuceneDocumentOrNull(IIndexEventItemModel item)
+    {
+        // Only reusable Banner items are handled by this strategy; web page items and other content types are skipped.
+        if (item is not IndexEventReusableItemModel indexedItem ||
+            !string.Equals(item.ContentTypeName, Banner.CONTENT_TYPE_NAME, StringComparison.OrdinalIgnoreCase))
+        {
+            return null;
+        }
+
+        var query = new ContentItemQueryBuilder()
+        .ForContentType(HomePage.CONTENT_TYPE_NAME,
+            config =>
+                config
+                    .WithLinkedItems(4)
+                    // Because the changedItem is a reusable content item, we don't have a website channel name to use here
+                    // so we use a hardcoded channel name.
+                    .ForWebsite(INDEXED_WEBSITECHANNEL_NAME)
+                    // Retrieves all HomePages that link to the Banner through the HomePage.HomePageBanner field
+                    .Linking(nameof(HomePage.HomePageBanner), new[] { indexedItem.ItemID }))
+        .InLanguage(indexedItem.LanguageName);
+
+        // The query can return no pages (nothing links to the banner), so avoid First(), which throws on empty results.
+        var associatedWebPageItem = (await queryExecutor.GetWebPageResult(query, webPageMapper.Map<HomePage>)).FirstOrDefault();
+        var banner = associatedWebPageItem?.HomePageBanner?.FirstOrDefault();
+        if (associatedWebPageItem is null || banner is null)
+        {
+            return null;
+        }
+
+        string url;
+        try
+        {
+            url = (await urlRetriever.Retrieve(associatedWebPageItem.SystemFields.WebPageItemTreePath,
+                INDEXED_WEBSITECHANNEL_NAME, indexedItem.LanguageName)).RelativePath;
+        }
+        catch (Exception)
+        {
+            // Retrieve can throw an exception when processing a page update LuceneQueueItem
+            // and the page was deleted before the update task has processed. In this case, return no item.
+            return null;
+        }
+
+        string title = banner.BannerText;
+        string rawContent = await webCrawler.CrawlWebPage(associatedWebPageItem);
+        string content = htmlSanitizer.SanitizeHtmlDocument(rawContent);
+        var document = new Document();
+
+        // If the indexed item is a reusable content item, we need to set the url manually.
+        document.Add(new StringField(BaseDocumentProperties.URL, url, Field.Store.YES));
+        document.Add(new TextField(nameof(DancingGoatSearchResultModel.Title), title, Field.Store.YES));
+        document.Add(new StringField(SORTABLE_TITLE_FIELD_NAME, title, Field.Store.YES));
+        document.Add(new TextField(CRAWLER_CONTENT_FIELD_NAME, content, Field.Store.NO));
+
+        return document;
+    }
+
+    /// <summary>Creates the facet configuration, marking the <see cref="FACET_DIMENSION"/> dimension as multi-valued.</summary>
+    public override FacetsConfig FacetsConfigFactory()
+    {
+        var facetConfig = new FacetsConfig();
+        facetConfig.SetMultiValued(FACET_DIMENSION, true);
+
+        return facetConfig;
+    }
+}
diff --git a/examples/DancingGoat/Search/Services/AdvancedSearchService.cs b/examples/DancingGoat/Search/Services/AdvancedSearchService.cs
@@ -77,7 +77,7 @@ public LuceneSearchResultModel<DancingGoatSearchResultModel> GlobalSearch(
 
                return new LuceneSearchResultModel<DancingGoatSearchResultModel>
                {
-                   Query = searchText ?? "",
+                   Query = searchText ?? string.Empty,
                    Page = page,
                    PageSize = pageSize,
                    TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,

diff --git a/examples/DancingGoat/Search/Services/SimpleSearchService.cs b/examples/DancingGoat/Search/Services/SimpleSearchService.cs
@@ -47,7 +47,7 @@ public LuceneSearchResultModel<DancingGoatSearchResultModel> GlobalSearch(
 
                return new LuceneSearchResultModel<DancingGoatSearchResultModel>
                {
-                   Query = searchText ?? "",
+                   Query = searchText ?? string.Empty,
                    Page = page,
                    PageSize = pageSize,
                    TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,