generated from Kentico/repo-template
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add sample implementation for pages crawler
- Loading branch information
Showing
8 changed files
with
319 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
19 changes: 19 additions & 0 deletions
19
src/Kentico.Xperience.Lucene.Sample/Search/CrawlerSearchController.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
using Microsoft.AspNetCore.Mvc; | ||
|
||
namespace DancingGoat.Search; | ||
|
||
/// <summary>
/// MVC controller that renders the search page backed by the crawler index.
/// </summary>
public class CrawlerSearchController : Controller
{
    private readonly DancingGoatCrawlerSearchService searchService;

    /// <summary>
    /// Creates the controller with the search service it delegates to.
    /// </summary>
    /// <param name="searchService">Service that executes queries against the crawler index.</param>
    public CrawlerSearchController(DancingGoatCrawlerSearchService searchService)
    {
        this.searchService = searchService;
    }

    /// <summary>
    /// Runs a search and renders the result page.
    /// </summary>
    /// <param name="query">Text to search for; passed to the search service unchanged.</param>
    /// <param name="pageSize">Number of hits per page.</param>
    /// <param name="page">1-based page number.</param>
    /// <returns>The default view with the search result as its model.</returns>
    [HttpGet]
    public IActionResult Index(string query, int pageSize = 10, int page = 1)
        => View(searchService.Search(query, pageSize, page));
}
|
49 changes: 49 additions & 0 deletions
49
src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchModel.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
using CMS.Core; | ||
using CMS.DocumentEngine; | ||
using CMS.DocumentEngine.Types.DancingGoatCore; | ||
using Kentico.Xperience.Lucene.Attributes; | ||
using Kentico.Xperience.Lucene.Models; | ||
using Kentico.Xperience.Lucene.Services.Implementations; | ||
using Lucene.Net.Documents; | ||
|
||
namespace DancingGoat.Search; | ||
|
||
/// <summary>
/// Search model for the "DancingGoatCrawler" index. It covers every path ("/%")
/// for the listed Dancing Goat content types.
/// </summary>
[IncludedPath("/%", ContentTypes = new[]
{
    AboutUs.CLASS_NAME,
    Article.CLASS_NAME,
    CafeSection.CLASS_NAME,
    Coffee.CLASS_NAME,
    Contacts.CLASS_NAME,
    Home.CLASS_NAME,
})]
public class DancingGoatCrawlerSearchModel : LuceneSearchModel
{
    /// <summary>
    /// Name under which this index is looked up.
    /// </summary>
    public const string IndexName = "DancingGoatCrawler";

    /// <summary>
    /// Page title, sourced from the <see cref="TreeNode.DocumentName"/> column.
    /// </summary>
    /// <remarks>
    /// Several source columns can be listed instead, for example a page-type
    /// specific title column followed by DocumentName.
    /// </remarks>
    [TextField(true)]
    [Source(new[] { nameof(TreeNode.DocumentName) })]
    public string Title { get; set; }

    /// <summary>
    /// Crawled page content; the value is produced by
    /// <see cref="DancingGoatCrawlerLuceneIndexingStrategy"/> while indexing.
    /// </summary>
    [TextField(false)]
    public string CrawlerContent { get; set; }
}
|
||
/// <summary>
/// Indexing strategy that fills <see cref="DancingGoatCrawlerSearchModel.CrawlerContent"/>
/// with the sanitized HTML obtained by crawling the page being indexed.
/// </summary>
public class DancingGoatCrawlerLuceneIndexingStrategy : DefaultLuceneIndexingStrategy
{
    /// <summary>
    /// Supplies the value to index for a single model property.
    /// </summary>
    /// <param name="node">Page being indexed.</param>
    /// <param name="propertyName">Name of the search model property being filled.</param>
    /// <param name="usedColumn">Source column the value was read from (unused here).</param>
    /// <param name="foundValue">Value already found for the property.</param>
    /// <returns>
    /// The crawled and sanitized page content for <c>CrawlerContent</c>;
    /// <paramref name="foundValue"/> unchanged for every other property.
    /// </returns>
    public override async Task<object> OnIndexingProperty(TreeNode node, string propertyName, string usedColumn, object foundValue)
    {
        // Only the crawler content property needs special handling.
        if (propertyName != nameof(DancingGoatCrawlerSearchModel.CrawlerContent))
        {
            return foundValue;
        }

        var sanitizer = Service.Resolve<WebScraperHtmlSanitizer>();
        var crawler = Service.Resolve<WebCrawlerService>();

        string pageHtml = await crawler.CrawlNode(node);
        return sanitizer.SanitizeHtmlDocument(pageHtml);
    }
}
9 changes: 9 additions & 0 deletions
9
src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchResultModel.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
|
||
namespace DancingGoat.Search; | ||
|
||
/// <summary>
/// A single hit shown on the crawler search result page.
/// </summary>
public class DancingGoatCrawlerSearchResultModel | ||
{ | ||
/// <summary>Title of the matched page, read from the indexed Title field.</summary>
public string Title { get; set; } | ||
/// <summary>Content type of the matched page, read from the indexed ClassName field.</summary>
public string ContentType { get; set; } | ||
/// <summary>URL of the matched page, read from the indexed Url field.</summary>
public string Url { get; set; } | ||
} |
84 changes: 84 additions & 0 deletions
84
src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchService.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
using Kentico.Xperience.Lucene; | ||
using Kentico.Xperience.Lucene.Models; | ||
using Kentico.Xperience.Lucene.Services; | ||
using Lucene.Net.Documents; | ||
using Lucene.Net.Search; | ||
using Lucene.Net.Util; | ||
|
||
namespace DancingGoat.Search; | ||
|
||
/// <summary>
/// Executes paged full-text searches against the crawler index
/// (<see cref="DancingGoatCrawlerSearchModel.IndexName"/>).
/// </summary>
public class DancingGoatCrawlerSearchService
{
    private const int PHRASE_SLOP = 3;
    private const int MAX_RESULTS = 1000;

    private readonly ILuceneIndexService luceneIndexService;

    /// <summary>
    /// Creates the service.
    /// </summary>
    /// <param name="luceneIndexService">Service used to obtain an index searcher.</param>
    public DancingGoatCrawlerSearchService(ILuceneIndexService luceneIndexService) => this.luceneIndexService = luceneIndexService;

    /// <summary>
    /// Searches the crawler index and returns one page of hits.
    /// </summary>
    /// <param name="searchText">Text to search for. Null or whitespace matches all documents.</param>
    /// <param name="pageSize">Hits per page; values below 1 are treated as 1.</param>
    /// <param name="page">1-based page number; values below 1 are treated as 1.</param>
    /// <returns>The requested page of hits together with paging totals.</returns>
    /// <exception cref="InvalidOperationException">The crawler index is not registered.</exception>
    public LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel> Search(string searchText, int pageSize = 20, int page = 1)
    {
        // The message reports the index that was actually looked up.
        var index = IndexStore.Instance.GetIndex(DancingGoatCrawlerSearchModel.IndexName)
            ?? throw new InvalidOperationException($"Index {DancingGoatCrawlerSearchModel.IndexName} was not found!!!");

        pageSize = Math.Max(1, pageSize);
        page = Math.Max(1, page);
        int offset = pageSize * (page - 1);
        int limit = pageSize;

        var queryBuilder = new QueryBuilder(index.Analyzer);

        Query query = string.IsNullOrWhiteSpace(searchText)
            ? new MatchAllDocsQuery()
            : GetTermQuery(queryBuilder, searchText);

        var result = luceneIndexService.UseSearcher(
            index,
            (searcher) =>
            {
                // Only the best MAX_RESULTS hits are fetched, so a page whose offset lies
                // beyond that window is returned empty.
                var topDocs = searcher.Search(query, MAX_RESULTS);
                return new LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel>()
                {
                    Query = searchText ?? "",
                    Page = page,
                    PageSize = pageSize,
                    TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,
                    TotalHits = topDocs.TotalHits,
                    Hits = topDocs.ScoreDocs
                        .Skip(offset)
                        .Take(limit)
                        .Select(d => MapToResultItem(searcher.Doc(d.Doc)))
                        .ToList(),
                };
            }
        );

        return result;
    }

    /// <summary>
    /// Builds the query for a non-empty search text: phrase matches and loose term
    /// matches on both the title and the crawled content, with title matches and
    /// phrase matches weighted higher.
    /// </summary>
    /// <param name="queryBuilder">Builder bound to the index analyzer.</param>
    /// <param name="searchText">Text entered by the user.</param>
    /// <returns>
    /// A boolean query of optional clauses. It has no clauses (and therefore matches
    /// nothing) when the analyzer produces no terms from <paramref name="searchText"/>.
    /// </returns>
    private static Query GetTermQuery(QueryBuilder queryBuilder, string searchText)
    {
        const string titleField = nameof(DancingGoatCrawlerSearchModel.Title);
        const string contentField = nameof(DancingGoatCrawlerSearchModel.CrawlerContent);

        var combined = new BooleanQuery();

        AddOptionalClause(combined, queryBuilder.CreatePhraseQuery(titleField, searchText, PHRASE_SLOP), 5f);
        AddOptionalClause(combined, queryBuilder.CreatePhraseQuery(contentField, searchText, PHRASE_SLOP), 1f);
        // Loose title match targets the Title field (it previously queried the content field twice).
        AddOptionalClause(combined, queryBuilder.CreateBooleanQuery(titleField, searchText, Occur.SHOULD), 0.5f);
        AddOptionalClause(combined, queryBuilder.CreateBooleanQuery(contentField, searchText, Occur.SHOULD), 0.1f);

        return combined;
    }

    /// <summary>
    /// Adds <paramref name="clause"/> to <paramref name="target"/> as an optional clause with the given boost.
    /// The query builder yields null when the analyzer produces no terms (for example
    /// a text made only of stop words); such clauses are skipped.
    /// </summary>
    private static void AddOptionalClause(BooleanQuery target, Query clause, float boost)
    {
        if (clause is null)
        {
            return;
        }

        clause.Boost = boost;
        target.Add(clause, Occur.SHOULD);
    }

    /// <summary>
    /// Maps a stored index document to the result item shown to the user.
    /// </summary>
    private DancingGoatCrawlerSearchResultModel MapToResultItem(Document doc) => new()
    {
        Title = doc.Get(nameof(DancingGoatCrawlerSearchModel.Title)),
        Url = doc.Get(nameof(DancingGoatCrawlerSearchModel.Url)),
        ContentType = doc.Get(nameof(DancingGoatCrawlerSearchModel.ClassName)),
    };
}
37 changes: 37 additions & 0 deletions
37
src/Kentico.Xperience.Lucene.Sample/Search/WebCrawlerService.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
using CMS.DocumentEngine; | ||
using Kentico.Content.Web.Mvc; | ||
using Microsoft.Net.Http.Headers; | ||
|
||
namespace DancingGoat.Search; | ||
|
||
/// <summary>
/// Downloads the HTML of site pages over HTTP so that it can be indexed.
/// </summary>
public class WebCrawlerService
{
    private readonly HttpClient httpClient;
    private readonly IPageUrlRetriever urlRetriever;

    /// <summary>
    /// Creates the crawler and configures the supplied HTTP client with a
    /// "SearchCrawler" user agent and the domain URL of the DancingGoatCore site as base address.
    /// </summary>
    /// <param name="httpClient">HTTP client used for all requests; it is reconfigured here.</param>
    /// <param name="urlRetriever">Resolves the URL of a page.</param>
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Major Code Smell", "S125:Sections of code should not be commented out", Justification = "Comments contain possible alternative solutions")]
    public WebCrawlerService(HttpClient httpClient, IPageUrlRetriever urlRetriever)
    {
        this.httpClient = httpClient;
        // configure the client inside constructor if needed (add custom headers etc.)
        this.httpClient.DefaultRequestHeaders.Add(HeaderNames.UserAgent, "SearchCrawler");
        this.httpClient.BaseAddress = new Uri(DocumentURLProvider.GetDomainUrl("DancingGoatCore"));
        // alternatively specify custom url or load it from settings
        // this.httpClient.BaseAddress = new Uri("http://localhost:41489/");

        this.urlRetriever = urlRetriever;
    }

    /// <summary>
    /// Downloads the content of the given page.
    /// </summary>
    /// <param name="node">Page to crawl.</param>
    /// <returns>The response body as a string.</returns>
    public async Task<string> CrawlNode(TreeNode node)
    {
        // The leading '~' of the relative path is removed so the URL resolves against BaseAddress.
        string url = urlRetriever.Retrieve(node).RelativePath.TrimStart('~');
        // urlRetriever.Retrieve(node).AbsolutePath and no BaseAddress could be used as an alternative
        return await CrawlPage(url);
    }

    /// <summary>
    /// Downloads the content at the given URL.
    /// </summary>
    /// <param name="url">URL to request, resolved against the client's base address when relative.</param>
    /// <returns>The response body as a string.</returns>
    public async Task<string> CrawlPage(string url)
    {
        // Dispose the response so its resources are released deterministically.
        using var response = await httpClient.GetAsync(url);

        // NOTE(review): the body is returned whatever the HTTP status is, so an error
        // page would be indexed as page content - confirm whether non-success
        // responses should be skipped.
        return await response.Content.ReadAsStringAsync();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
100 changes: 100 additions & 0 deletions
100
src/Kentico.Xperience.Lucene.Sample/Views/CrawlerSearch/Index.cshtml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
@using Kentico.Xperience.Lucene.Models | ||
@using DancingGoat.Search | ||
|
||
@model LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel> | ||
|
||
@{
    @* Builds the route values for a pager link. The keys match the parameters of
       CrawlerSearchController.Index (query, pageSize, page) so the search text and
       page size survive paging. *@
    Dictionary<string, string> GetRouteData(int page) =>
        new Dictionary<string, string>()
        {
            { "query", Model.Query },
            { "pageSize", Model.PageSize.ToString() },
            { "page", page.ToString() },
        };
}

<h1>Search</h1>

<style>
    .form-field {
        margin-bottom: 0.8rem;
    }
</style>

@* Search form: submits the query (plus the current paging values) back to the Index action. *@
<div class="row" style="padding: 1rem;">
    <div class="col-12">
        <form asp-action="Index" method="get">
            <div class="row">
                <div class="col-md-12">
                    <div class="form-field">
                        <label class="control-label" asp-for="@Model.Query"></label>
                        <div class="editing-form-control-nested-control">
                            <input class="form-control" asp-for="@Model.Query">
                            <input type="hidden" asp-for="@Model.Page" />
                            <input type="hidden" asp-for="@Model.PageSize" />
                        </div>
                    </div>
                </div>
            </div>

            <input type="submit" value="Submit">
        </form>
    </div>
</div>

@* No hits: show a message only when something was searched for, then stop rendering. *@
@if (!Model.Hits.Any())
{
    if (!String.IsNullOrWhiteSpace(Model.Query))
    {
        @HtmlLocalizer["Sorry, no results match {0}", Model.Query]
    }

    return;
}

@foreach (var item in Model.Hits)
{
    <div class="row search-tile">
        <div class="col-md-12 col-lg-12 search-tile-content">
            <h3 class="h4 search-tile-title">
                <a href="@item.Url">@item.Title</a>
            </h3>
            @item.ContentType
        </div>
    </div>
}

@* Pager: links target this view's own controller (CrawlerSearch). *@
<div class="pagination-container">
    <ul class="pagination">
        @if (Model.Page > 1)
        {
            <li class="PagedList-skipToPrevious">
                <a asp-controller="CrawlerSearch" asp-action="Index" asp-all-route-data="GetRouteData(Model.Page - 1)">
                    @HtmlLocalizer["previous"]
                </a>
            </li>
        }

        @for (int pageNumber = 1; pageNumber <= Model.TotalPages; pageNumber++)
        {
            if (pageNumber == Model.Page)
            {
                <li class="active">
                    <span>
                        @pageNumber
                    </span>
                </li>
            }
            else
            {
                <li>
                    <a asp-controller="CrawlerSearch" asp-action="Index" asp-all-route-data="GetRouteData(pageNumber)">@pageNumber</a>
                </li>
            }
        }

        @if (Model.Page < Model.TotalPages)
        {
            <li class="PagedList-skipToNext">
                <a asp-controller="CrawlerSearch" asp-action="Index" asp-all-route-data="GetRouteData(Model.Page + 1)">
                    @HtmlLocalizer["next"]
                </a>
            </li>
        }
    </ul>
</div>