From 2e4a6ed9294351fdd76420564f8faae94bc6cbce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miloslav=20Hlav=C3=A1=C4=8D?= Date: Thu, 3 Aug 2023 13:09:10 +0200 Subject: [PATCH] Add sample implementation for pages crawler --- .../Program.cs | 10 +- .../Search/CrawlerSearchController.cs | 19 ++++ .../Search/DancingGoatCrawlerSearchModel.cs | 49 +++++++++ .../DancingGoatCrawlerSearchResultModel.cs | 9 ++ .../Search/DancingGoatCrawlerSearchService.cs | 84 +++++++++++++++ .../Search/WebCrawlerService.cs | 37 +++++++ .../Search/WebScraperHtmlSanitizer.cs | 12 +++ .../Views/CrawlerSearch/Index.cshtml | 100 ++++++++++++++++++ 8 files changed, 319 insertions(+), 1 deletion(-) create mode 100644 src/Kentico.Xperience.Lucene.Sample/Search/CrawlerSearchController.cs create mode 100644 src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchModel.cs create mode 100644 src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchResultModel.cs create mode 100644 src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchService.cs create mode 100644 src/Kentico.Xperience.Lucene.Sample/Search/WebCrawlerService.cs create mode 100644 src/Kentico.Xperience.Lucene.Sample/Views/CrawlerSearch/Index.cshtml diff --git a/src/Kentico.Xperience.Lucene.Sample/Program.cs b/src/Kentico.Xperience.Lucene.Sample/Program.cs index 2194efa..a2814b8 100644 --- a/src/Kentico.Xperience.Lucene.Sample/Program.cs +++ b/src/Kentico.Xperience.Lucene.Sample/Program.cs @@ -24,7 +24,7 @@ The constraint ensures that broken URLs lead to a "404 page not found" page and are not handled by a controller dedicated to the component or to a page handled by the content tree-based router (which would lead to an exception). 
*/
-const string CONSTRAINT_FOR_NON_ROUTER_PAGE_CONTROLLERS = "Account|Consent|Subscription|Coffees|Search";
+const string CONSTRAINT_FOR_NON_ROUTER_PAGE_CONTROLLERS = "Account|Consent|Subscription|Coffees|Search|CrawlerSearch";
 
 var builder = WebApplication.CreateBuilder(args);
 
@@ -95,9 +95,17 @@ The constraint ensures that broken URLs lead to a "404 page not found" page and
         DancingGoatSearchModel.IndexName,
         indexPath: null,
         new DancingGoatLuceneIndexingStrategy()),
+    new LuceneIndex(
+        typeof(DancingGoatCrawlerSearchModel),
+        new StandardAnalyzer(Lucene.Net.Util.LuceneVersion.LUCENE_48),
+        DancingGoatCrawlerSearchModel.IndexName,
+        indexPath: null,
+        new DancingGoatCrawlerLuceneIndexingStrategy()),
 });
 builder.Services.AddSingleton();
 builder.Services.AddSingleton();
+builder.Services.AddHttpClient<WebCrawlerService>();
+builder.Services.AddSingleton<DancingGoatCrawlerSearchService>();
 
 ConfigureMembershipServices(builder.Services);
 ConfigurePageBuilderFilters();
diff --git a/src/Kentico.Xperience.Lucene.Sample/Search/CrawlerSearchController.cs b/src/Kentico.Xperience.Lucene.Sample/Search/CrawlerSearchController.cs
new file mode 100644
index 0000000..b4a07b3
--- /dev/null
+++ b/src/Kentico.Xperience.Lucene.Sample/Search/CrawlerSearchController.cs
@@ -0,0 +1,28 @@
+using Microsoft.AspNetCore.Mvc;
+
+namespace DancingGoat.Search;
+
+/// <summary>
+/// Serves the search page that is backed by the crawler index.
+/// </summary>
+public class CrawlerSearchController : Controller
+{
+    private readonly DancingGoatCrawlerSearchService searchService;
+
+    public CrawlerSearchController(DancingGoatCrawlerSearchService searchService) => this.searchService = searchService;
+
+    /// <summary>
+    /// Runs the search and renders the result view.
+    /// </summary>
+    /// <param name="query">Text to search for, passed to the search service unchanged.</param>
+    /// <param name="pageSize">Number of hits per page.</param>
+    /// <param name="page">Number of the requested page.</param>
+    [HttpGet]
+    public IActionResult Index(string query, int pageSize = 10, int page = 1)
+    {
+        var results = searchService.Search(query, pageSize, page);
+
+        return View(results);
+    }
+}
+
diff --git a/src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchModel.cs b/src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchModel.cs
new file mode 100644
index 0000000..74997ff
--- /dev/null
+++
b/src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchModel.cs
@@ -0,0 +1,59 @@
+using CMS.Core;
+using CMS.DocumentEngine;
+using CMS.DocumentEngine.Types.DancingGoatCore;
+using Kentico.Xperience.Lucene.Attributes;
+using Kentico.Xperience.Lucene.Models;
+using Kentico.Xperience.Lucene.Services.Implementations;
+using Lucene.Net.Documents;
+
+namespace DancingGoat.Search;
+
+/// <summary>
+/// Search model of the crawler index; the attribute below selects the path "/%" and the listed content types.
+/// </summary>
+[IncludedPath("/%", ContentTypes = new string[] {
+    AboutUs.CLASS_NAME,
+    Article.CLASS_NAME,
+    CafeSection.CLASS_NAME,
+    Coffee.CLASS_NAME,
+    Contacts.CLASS_NAME,
+    Home.CLASS_NAME,
+})]
+public class DancingGoatCrawlerSearchModel : LuceneSearchModel
+{
+    public const string IndexName = "DancingGoatCrawler";
+
+    [TextField(true)]
+    [Source(new string[] { nameof(TreeNode.DocumentName) })]
+    public string Title { get; set; }
+
+    // The value is supplied by DancingGoatCrawlerLuceneIndexingStrategy.OnIndexingProperty.
+    [TextField(false)]
+    public string CrawlerContent { get; set; }
+
+}
+
+/// <summary>
+/// Indexing strategy that fills <see cref="DancingGoatCrawlerSearchModel.CrawlerContent"/> with the
+/// sanitized text of the page downloaded by <see cref="WebCrawlerService"/>.
+/// </summary>
+public class DancingGoatCrawlerLuceneIndexingStrategy : DefaultLuceneIndexingStrategy
+{
+    /// <summary>
+    /// Returns the crawled and sanitized page text for the CrawlerContent property and <paramref name="foundValue"/> for every other property.
+    /// </summary>
+    public override async Task<object> OnIndexingProperty(TreeNode node, string propertyName, string usedColumn, object foundValue)
+    {
+        object result = foundValue;
+        if (propertyName == nameof(DancingGoatCrawlerSearchModel.CrawlerContent))
+        {
+            var htmlSanitizer = Service.Resolve<WebScraperHtmlSanitizer>();
+            var webCrawler = Service.Resolve<WebCrawlerService>();
+
+            string content = await webCrawler.CrawlNode(node);
+            result = htmlSanitizer.SanitizeHtmlDocument(content);
+        }
+
+        return result;
+    }
+}
diff --git a/src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchResultModel.cs b/src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchResultModel.cs
new file mode 100644
index 0000000..446f675
--- /dev/null
+++ b/src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchResultModel.cs
@@ -0,0 +1,12 @@
+
+namespace DancingGoat.Search;
+
+/// <summary>
+/// One hit of the crawler search as shown in the result list.
+/// </summary>
+public class DancingGoatCrawlerSearchResultModel
+{
+    public string Title { get; set; }
+    public string ContentType { get; set; }
+    public string Url { get; set; }
+}
diff --git a/src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchService.cs b/src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchService.cs
new file mode 100644
index 0000000..6f33d78
--- /dev/null
+++ b/src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchService.cs
@@ -0,0 +1,111 @@
+using Kentico.Xperience.Lucene;
+using Kentico.Xperience.Lucene.Models;
+using Kentico.Xperience.Lucene.Services;
+using Lucene.Net.Documents;
+using Lucene.Net.Search;
+using Lucene.Net.Util;
+
+namespace DancingGoat.Search;
+
+/// <summary>
+/// Runs paged full-text queries against the crawler index.
+/// </summary>
+public class DancingGoatCrawlerSearchService
+{
+    private const int PHRASE_SLOP = 3;
+    private const int MAX_RESULTS = 1000;
+
+    private readonly ILuceneIndexService luceneIndexService;
+
+    public DancingGoatCrawlerSearchService(ILuceneIndexService luceneIndexService) => this.luceneIndexService = luceneIndexService;
+
+    /// <summary>
+    /// Searches the crawler index and returns one page of hits.
+    /// </summary>
+    /// <param name="searchText">Text to search for; null or white space matches all documents.</param>
+    /// <param name="pageSize">Hits per page; values below 1 are raised to 1.</param>
+    /// <param name="page">1-based page number; values below 1 are raised to 1.</param>
+    /// <returns>The requested page of hits together with the paging totals.</returns>
+    /// <exception cref="InvalidOperationException">The crawler index is not registered in the index store.</exception>
+    public LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel> Search(string searchText, int pageSize = 20, int page = 1)
+    {
+        var index = IndexStore.Instance.GetIndex(DancingGoatCrawlerSearchModel.IndexName)
+            ?? throw new InvalidOperationException($"Index {DancingGoatCrawlerSearchModel.IndexName} was not found!!!");
+        pageSize = Math.Max(1, pageSize);
+        page = Math.Max(1, page);
+        int offset = pageSize * (page - 1);
+        int limit = pageSize;
+
+        var queryBuilder = new QueryBuilder(index.Analyzer);
+
+        var query = string.IsNullOrWhiteSpace(searchText)
+            ? new MatchAllDocsQuery()
+            : GetTermQuery(queryBuilder, searchText);
+
+        var result = luceneIndexService.UseSearcher(
+            index,
+            (searcher) =>
+            {
+                // Only the best MAX_RESULTS hits are fetched; paging is applied to that window.
+                var topDocs = searcher.Search(query, MAX_RESULTS);
+
+                return new LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel>()
+                {
+                    Query = searchText ?? "",
+                    Page = page,
+                    PageSize = pageSize,
+                    TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,
+                    TotalHits = topDocs.TotalHits,
+                    Hits = topDocs.ScoreDocs
+                        .Skip(offset)
+                        .Take(limit)
+                        .Select(d => MapToResultItem(searcher.Doc(d.Doc)))
+                        .ToList(),
+                };
+            }
+        );
+
+        return result;
+    }
+
+    /// <summary>
+    /// Builds the query for non-empty search text: a phrase clause and a term clause for both
+    /// the title and the crawled content, all optional, with higher boosts on the title clauses.
+    /// </summary>
+    private static Query GetTermQuery(QueryBuilder queryBuilder, string searchText)
+    {
+        const string titleField = nameof(DancingGoatCrawlerSearchModel.Title);
+        const string contentField = nameof(DancingGoatCrawlerSearchModel.CrawlerContent);
+
+        var combined = new BooleanQuery();
+
+        AddClause(combined, queryBuilder.CreatePhraseQuery(titleField, searchText, PHRASE_SLOP), 5);
+        AddClause(combined, queryBuilder.CreatePhraseQuery(contentField, searchText, PHRASE_SLOP), 1);
+        AddClause(combined, queryBuilder.CreateBooleanQuery(titleField, searchText, Occur.SHOULD), 0.5f);
+        AddClause(combined, queryBuilder.CreateBooleanQuery(contentField, searchText, Occur.SHOULD), 0.1f);
+
+        return combined;
+    }
+
+    /// <summary>
+    /// Adds <paramref name="clause"/> as an optional clause with the given boost. QueryBuilder yields null
+    /// when the analyzer produces no terms (e.g. text made of stop words only); such clauses are skipped.
+    /// </summary>
+    private static void AddClause(BooleanQuery target, Query clause, float boost)
+    {
+        if (clause is null)
+        {
+            return;
+        }
+
+        clause.Boost = boost;
+        target.Add(clause, Occur.SHOULD);
+    }
+
+    private DancingGoatCrawlerSearchResultModel MapToResultItem(Document doc) => new()
+    {
+        Title = doc.Get(nameof(DancingGoatCrawlerSearchModel.Title)),
+        Url = doc.Get(nameof(DancingGoatCrawlerSearchModel.Url)),
+        ContentType = doc.Get(nameof(DancingGoatCrawlerSearchModel.ClassName)),
+    };
+}
diff --git a/src/Kentico.Xperience.Lucene.Sample/Search/WebCrawlerService.cs b/src/Kentico.Xperience.Lucene.Sample/Search/WebCrawlerService.cs
new file mode 100644
index 0000000..78cc297
--- /dev/null
+++ b/src/Kentico.Xperience.Lucene.Sample/Search/WebCrawlerService.cs
@@ -0,0 +1,54 @@
+using CMS.DocumentEngine;
+using Kentico.Content.Web.Mvc;
+using Microsoft.Net.Http.Headers;
+
+namespace DancingGoat.Search;
+
+/// <summary>
+/// Downloads pages of the site over HTTP so that their rendered content can be indexed.
+/// </summary>
+public class WebCrawlerService
+{
+    private readonly HttpClient httpClient;
+    private readonly IPageUrlRetriever urlRetriever;
+
+    [System.Diagnostics.CodeAnalysis.SuppressMessage("Major Code Smell", "S125:Sections of code should not be commented out", Justification = "Comments contain possible alternative solutions")]
+    public WebCrawlerService(HttpClient httpClient, IPageUrlRetriever urlRetriever)
+    {
+        this.httpClient = httpClient;
+        // configure the client inside constructor if needed (add custom headers etc.)
+        this.httpClient.DefaultRequestHeaders.Add(HeaderNames.UserAgent, "SearchCrawler");
+        this.httpClient.BaseAddress = new Uri(DocumentURLProvider.GetDomainUrl("DancingGoatCore"));
+        // alternatively specify custom url or load it from settings
+        // this.httpClient.BaseAddress = new Uri("http://localhost:41489/");
+
+        this.urlRetriever = urlRetriever;
+    }
+
+    /// <summary>
+    /// Downloads the page of <paramref name="node"/>, addressed by its relative path with the leading '~' removed.
+    /// </summary>
+    public async Task<string> CrawlNode(TreeNode node)
+    {
+        string url = urlRetriever.Retrieve(node).RelativePath.TrimStart('~');
+        // urlRetriever.Retrieve(node).AbsolutePath and no BaseAddress could be used as an alternative
+        return await CrawlPage(url);
+    }
+
+    /// <summary>
+    /// Downloads the page at <paramref name="url"/> and returns its body.
+    /// </summary>
+    /// <returns>The response body, or an empty string when the server answers with a non-success status code.</returns>
+    public async Task<string> CrawlPage(string url)
+    {
+        using var response = await httpClient.GetAsync(url);
+
+        // An error page (404, 500, ...) must not be indexed as the content of the node.
+        if (!response.IsSuccessStatusCode)
+        {
+            return string.Empty;
+        }
+
+        return await response.Content.ReadAsStringAsync();
+    }
+}
diff --git a/src/Kentico.Xperience.Lucene.Sample/Search/WebScraperHtmlSanitizer.cs b/src/Kentico.Xperience.Lucene.Sample/Search/WebScraperHtmlSanitizer.cs
index b29cc4b..8b8d028 100644
--- a/src/Kentico.Xperience.Lucene.Sample/Search/WebScraperHtmlSanitizer.cs
+++ b/src/Kentico.Xperience.Lucene.Sample/Search/WebScraperHtmlSanitizer.cs
@@ -69,6 +69,18 @@ public virtual string SanitizeHtmlDocument(string htmlContent)
             element.Remove();
         }
 
+        // Removes header
+        foreach (var element in body.QuerySelectorAll("header"))
+        {
+            element.Remove();
+        }
+
+        // Removes footer
+        foreach (var element in body.QuerySelectorAll(".footer-wrapper"))
+        {
+            element.Remove();
+        }
+
         // Gets the text content of the body element
         string textContent = body.TextContent;
 
diff --git a/src/Kentico.Xperience.Lucene.Sample/Views/CrawlerSearch/Index.cshtml
b/src/Kentico.Xperience.Lucene.Sample/Views/CrawlerSearch/Index.cshtml
new file mode 100644
index 0000000..b8babd8
--- /dev/null
+++ b/src/Kentico.Xperience.Lucene.Sample/Views/CrawlerSearch/Index.cshtml
@@ -0,0 +1,102 @@
+@using Kentico.Xperience.Lucene.Models
+@using DancingGoat.Search
+
+@model LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel>
+
+@{
+    // Keys must match the parameters of CrawlerSearchController.Index(query, pageSize, page),
+    // otherwise paging links drop the search text and the page size.
+    Dictionary<string, string> GetRouteData(int page) =>
+        new Dictionary<string, string>() { { "query", Model.Query }, { "pageSize", Model.PageSize.ToString() }, { "page", page.ToString() } };
+}
+
+

Search

+ + + +
+
+
+
+
+
+ +
+ + + +
+
+
+
+ + +
+
+
+ +@if (!Model.Hits.Any()) +{ + if (!String.IsNullOrWhiteSpace(Model.Query)) + { + @HtmlLocalizer["Sorry, no results match {0}", Model.Query] + } + + return; +} + +@foreach (var item in Model.Hits) +{ +
+
+

+ @item.Title +

+ @item.ContentType +
+
+} + +
+ +