generated from Kentico/repo-template
-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add sample implementation for pages crawler #5
Merged
seangwright
merged 1 commit into
Kentico:main
from
nittin-cz:feature/sample-crawler-implementation
Aug 3, 2023
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
19 changes: 19 additions & 0 deletions
19
src/Kentico.Xperience.Lucene.Sample/Search/CrawlerSearchController.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
using Microsoft.AspNetCore.Mvc; | ||
|
||
namespace DancingGoat.Search; | ||
|
||
/// <summary>
/// MVC controller for the crawler-based sample search page.
/// Delegates all searching to <see cref="DancingGoatCrawlerSearchService"/>.
/// </summary>
public class CrawlerSearchController : Controller
{
    private readonly DancingGoatCrawlerSearchService searchService;

    /// <summary>
    /// Creates the controller with the search service it forwards requests to.
    /// </summary>
    /// <param name="searchService">Service used to execute the search.</param>
    public CrawlerSearchController(DancingGoatCrawlerSearchService searchService)
    {
        this.searchService = searchService;
    }

    /// <summary>
    /// Executes the search and renders the default view with the result model.
    /// </summary>
    /// <param name="query">Text to search for.</param>
    /// <param name="pageSize">Requested number of hits per page.</param>
    /// <param name="page">Requested page number.</param>
    /// <returns>The view result carrying the search result model.</returns>
    [HttpGet]
    public IActionResult Index(string query, int pageSize = 10, int page = 1) =>
        View(searchService.Search(query, pageSize, page));
}
|
49 changes: 49 additions & 0 deletions
49
src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchModel.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
using CMS.Core; | ||
using CMS.DocumentEngine; | ||
using CMS.DocumentEngine.Types.DancingGoatCore; | ||
using Kentico.Xperience.Lucene.Attributes; | ||
using Kentico.Xperience.Lucene.Models; | ||
using Kentico.Xperience.Lucene.Services.Implementations; | ||
using Lucene.Net.Documents; | ||
|
||
namespace DancingGoat.Search; | ||
|
||
/// <summary>
/// Search model describing the fields of the crawler-based sample index.
/// The <see cref="IncludedPathAttribute"/> below lists the content types under the
/// <c>/%</c> path that this model applies to.
/// </summary>
[IncludedPath("/%", ContentTypes = new[]
{
    AboutUs.CLASS_NAME,
    Article.CLASS_NAME,
    CafeSection.CLASS_NAME,
    Coffee.CLASS_NAME,
    Contacts.CLASS_NAME,
    Home.CLASS_NAME,
})]
public class DancingGoatCrawlerSearchModel : LuceneSearchModel
{
    /// <summary>
    /// Name of the index this model belongs to; the search service looks the index up by this value.
    /// </summary>
    public const string IndexName = "DancingGoatCrawler";

    /// <summary>
    /// Page title, sourced from the <see cref="TreeNode.DocumentName"/> column.
    /// </summary>
    // NOTE(review): the boolean passed to TextField is presumably the "store value" flag — confirm against the attribute.
    [TextField(true), Source(new[] { nameof(TreeNode.DocumentName) })]
    public string Title { get; set; }

    /// <summary>
    /// Page content; its value is produced by <see cref="DancingGoatCrawlerLuceneIndexingStrategy"/>
    /// when this property is indexed.
    /// </summary>
    [TextField(false)]
    public string CrawlerContent { get; set; }
}
|
||
/// <summary>
/// Indexing strategy that replaces the value of
/// <see cref="DancingGoatCrawlerSearchModel.CrawlerContent"/> with the crawled page markup
/// after it has been passed through <see cref="WebScraperHtmlSanitizer"/>.
/// </summary>
public class DancingGoatCrawlerLuceneIndexingStrategy : DefaultLuceneIndexingStrategy
{
    /// <summary>
    /// Returns the value to index for a single property of a node.
    /// </summary>
    /// <param name="node">Page being indexed.</param>
    /// <param name="propertyName">Name of the search model property being filled.</param>
    /// <param name="usedColumn">Column the value was read from (unused here).</param>
    /// <param name="foundValue">Value found for the property.</param>
    /// <returns>
    /// The sanitized crawl result for <c>CrawlerContent</c>; <paramref name="foundValue"/> unchanged
    /// for every other property.
    /// </returns>
    public override async Task<object> OnIndexingProperty(TreeNode node, string propertyName, string usedColumn, object foundValue)
    {
        // Only the crawler content property needs special handling.
        if (propertyName != nameof(DancingGoatCrawlerSearchModel.CrawlerContent))
        {
            return foundValue;
        }

        var sanitizer = Service.Resolve<WebScraperHtmlSanitizer>();
        var crawler = Service.Resolve<WebCrawlerService>();

        string crawledMarkup = await crawler.CrawlNode(node);
        return sanitizer.SanitizeHtmlDocument(crawledMarkup);
    }
}
9 changes: 9 additions & 0 deletions
9
src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchResultModel.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
|
||
namespace DancingGoat.Search; | ||
|
||
/// <summary>
/// A single hit displayed on the crawler search results page.
/// </summary>
public class DancingGoatCrawlerSearchResultModel
{
    /// <summary>Title of the found page.</summary>
    public string Title { get; set; }

    /// <summary>Content type of the found page (filled from the indexed class name).</summary>
    public string ContentType { get; set; }

    /// <summary>URL the result links to.</summary>
    public string Url { get; set; }
}
84 changes: 84 additions & 0 deletions
84
src/Kentico.Xperience.Lucene.Sample/Search/DancingGoatCrawlerSearchService.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
using Kentico.Xperience.Lucene; | ||
using Kentico.Xperience.Lucene.Models; | ||
using Kentico.Xperience.Lucene.Services; | ||
using Lucene.Net.Documents; | ||
using Lucene.Net.Search; | ||
using Lucene.Net.Util; | ||
|
||
namespace DancingGoat.Search; | ||
|
||
/// <summary>
/// Runs full-text queries against the index named by
/// <see cref="DancingGoatCrawlerSearchModel.IndexName"/> and maps the hits to
/// <see cref="DancingGoatCrawlerSearchResultModel"/> instances.
/// </summary>
public class DancingGoatCrawlerSearchService
{
    private const int PHRASE_SLOP = 3;
    private const int MAX_RESULTS = 1000;

    private readonly ILuceneIndexService luceneIndexService;

    /// <summary>
    /// Creates the service with the index service used to obtain a searcher.
    /// </summary>
    public DancingGoatCrawlerSearchService(ILuceneIndexService luceneIndexService) => this.luceneIndexService = luceneIndexService;

    /// <summary>
    /// Searches the crawler index and returns one page of results.
    /// </summary>
    /// <param name="searchText">Text to search for; null or whitespace matches all documents.</param>
    /// <param name="pageSize">Hits per page; values below 1 are raised to 1.</param>
    /// <param name="page">1-based page number; values below 1 are raised to 1.</param>
    /// <returns>The requested page of hits together with paging totals.</returns>
    /// <exception cref="InvalidOperationException">The crawler index is not registered.</exception>
    public LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel> Search(string searchText, int pageSize = 20, int page = 1)
    {
        // The message names the index that was actually looked up.
        var index = IndexStore.Instance.GetIndex(DancingGoatCrawlerSearchModel.IndexName)
            ?? throw new InvalidOperationException($"Index {DancingGoatCrawlerSearchModel.IndexName} was not found!!!");

        pageSize = Math.Max(1, pageSize);
        page = Math.Max(1, page);
        int offset = pageSize * (page - 1);
        int limit = pageSize;

        var queryBuilder = new QueryBuilder(index.Analyzer);

        Query query = string.IsNullOrWhiteSpace(searchText)
            ? new MatchAllDocsQuery()
            : GetTermQuery(queryBuilder, searchText);

        var result = luceneIndexService.UseSearcher(
            index,
            (searcher) =>
            {
                // NOTE(review): only the first MAX_RESULTS documents are fetched, while TotalPages is
                // derived from TotalHits, so pages beyond MAX_RESULTS hits come back empty.
                var topDocs = searcher.Search(query, MAX_RESULTS);
                return new LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel>()
                {
                    Query = searchText ?? "",
                    Page = page,
                    PageSize = pageSize,
                    TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,
                    TotalHits = topDocs.TotalHits,
                    Hits = topDocs.ScoreDocs
                        .Skip(offset)
                        .Take(limit)
                        .Select(d => MapToResultItem(searcher.Doc(d.Doc)))
                        .ToList(),
                };
            }
        );

        return result;
    }

    /// <summary>
    /// Builds the combined query: phrase and per-term matches on both the title and the
    /// crawled content, with title matches boosted above content matches.
    /// </summary>
    private static Query GetTermQuery(QueryBuilder queryBuilder, string searchText)
    {
        const string titleField = nameof(DancingGoatCrawlerSearchModel.Title);
        const string contentField = nameof(DancingGoatCrawlerSearchModel.CrawlerContent);

        var combined = new BooleanQuery();
        AddOptionalClause(combined, queryBuilder.CreatePhraseQuery(titleField, searchText, PHRASE_SLOP), 5);
        AddOptionalClause(combined, queryBuilder.CreatePhraseQuery(contentField, searchText, PHRASE_SLOP), 1);
        // The per-term title clause targets the Title field, not the content field.
        AddOptionalClause(combined, queryBuilder.CreateBooleanQuery(titleField, searchText, Occur.SHOULD), 0.5f);
        AddOptionalClause(combined, queryBuilder.CreateBooleanQuery(contentField, searchText, Occur.SHOULD), 0.1f);

        return combined;
    }

    /// <summary>
    /// Adds <paramref name="clause"/> to <paramref name="target"/> as an optional clause with the given boost.
    /// The query builder yields null when the text analyzes to no tokens (for example, stop words only);
    /// such clauses are skipped instead of causing a null dereference.
    /// </summary>
    private static void AddOptionalClause(BooleanQuery target, Query clause, float boost)
    {
        if (clause is null)
        {
            return;
        }

        clause.Boost = boost;
        target.Add(clause, Occur.SHOULD);
    }

    /// <summary>
    /// Maps the stored fields of a Lucene document to a result item.
    /// </summary>
    private static DancingGoatCrawlerSearchResultModel MapToResultItem(Document doc) => new()
    {
        Title = doc.Get(nameof(DancingGoatCrawlerSearchModel.Title)),
        Url = doc.Get(nameof(DancingGoatCrawlerSearchModel.Url)),
        ContentType = doc.Get(nameof(DancingGoatCrawlerSearchModel.ClassName)),
    };
}
37 changes: 37 additions & 0 deletions
37
src/Kentico.Xperience.Lucene.Sample/Search/WebCrawlerService.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
using CMS.DocumentEngine; | ||
using Kentico.Content.Web.Mvc; | ||
using Microsoft.Net.Http.Headers; | ||
|
||
namespace DancingGoat.Search; | ||
|
||
/// <summary>
/// Downloads the rendered markup of pages over HTTP so that it can be indexed.
/// </summary>
public class WebCrawlerService
{
    private readonly HttpClient httpClient;
    private readonly IPageUrlRetriever urlRetriever;

    /// <summary>
    /// Configures the supplied HTTP client (user agent and base address) and stores the URL retriever.
    /// </summary>
    /// <param name="httpClient">Client used for all crawl requests.</param>
    /// <param name="urlRetriever">Retriever used to resolve a page's URL.</param>
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Major Code Smell", "S125:Sections of code should not be commented out", Justification = "Comments contain possible alternative solutions")]
    public WebCrawlerService(HttpClient httpClient, IPageUrlRetriever urlRetriever)
    {
        this.httpClient = httpClient;
        // configure the client inside constructor if needed (add custom headers etc.)
        this.httpClient.DefaultRequestHeaders.Add(HeaderNames.UserAgent, "SearchCrawler");
        this.httpClient.BaseAddress = new Uri(DocumentURLProvider.GetDomainUrl("DancingGoatCore"));
        // alternatively specify custom url or load it from settings
        // this.httpClient.BaseAddress = new Uri("http://localhost:41489/");

        this.urlRetriever = urlRetriever;
    }

    /// <summary>
    /// Crawls the page represented by <paramref name="node"/>.
    /// </summary>
    /// <param name="node">Page whose relative URL is requested against the client's base address.</param>
    /// <returns>The response body as a string.</returns>
    public async Task<string> CrawlNode(TreeNode node)
    {
        // The relative path is application-rooted ("~/..."); strip the tilde before requesting it.
        string url = urlRetriever.Retrieve(node).RelativePath.TrimStart('~');
        // urlRetriever.Retrieve(node).AbsolutePath and no BaseAddress could be used as an alternative
        return await CrawlPage(url);
    }

    /// <summary>
    /// Requests <paramref name="url"/> and returns the response body.
    /// </summary>
    /// <param name="url">URL to request, resolved against the client's base address when relative.</param>
    /// <returns>The response body as a string; the status code is not inspected.</returns>
    public async Task<string> CrawlPage(string url)
    {
        // Dispose the response so its content stream and connection are released promptly.
        using var response = await httpClient.GetAsync(url);
        return await response.Content.ReadAsStringAsync();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
100 changes: 100 additions & 0 deletions
100
src/Kentico.Xperience.Lucene.Sample/Views/CrawlerSearch/Index.cshtml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
@using Kentico.Xperience.Lucene.Models | ||
@using DancingGoat.Search | ||
|
||
@model LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel> | ||
|
||
@{
    // Route values for a paging link. The keys match the parameters of
    // CrawlerSearchController.Index (query, pageSize, page) so the search text and
    // page size survive navigation between pages.
    Dictionary<string, string> GetRouteData(int page) =>
        new Dictionary<string, string>()
        {
            { "query", Model.Query },
            { "pageSize", Model.PageSize.ToString() },
            { "page", page.ToString() },
        };
}

<h1>Search</h1>

<style>
    .form-field {
        margin-bottom: 0.8rem;
    }
</style>

@* Search form: submits back to the Index action via GET. *@
<div class="row" style="padding: 1rem;">
    <div class="col-12">
        <form asp-action="Index" method="get">
            <div class="row">
                <div class="col-md-12">
                    <div class="form-field">
                        <label class="control-label" asp-for="@Model.Query"></label>
                        <div class="editing-form-control-nested-control">
                            <input class="form-control" asp-for="@Model.Query">
                            <input type="hidden" asp-for="@Model.Page" />
                            <input type="hidden" asp-for="@Model.PageSize" />
                        </div>
                    </div>
                </div>
            </div>

            <input type="submit" value="Submit">
        </form>
    </div>
</div>

@* Nothing to list: show a message only when the user actually searched for something, then stop rendering. *@
@if (!Model.Hits.Any())
{
    if (!String.IsNullOrWhiteSpace(Model.Query))
    {
        @HtmlLocalizer["Sorry, no results match {0}", Model.Query]
    }

    return;
}

@foreach (var item in Model.Hits)
{
    <div class="row search-tile">
        <div class="col-md-12 col-lg-12 search-tile-content">
            <h3 class="h4 search-tile-title">
                <a href="@item.Url">@item.Title</a>
            </h3>
            @item.ContentType
        </div>
    </div>
}

@* Pager: links target this view's own controller (CrawlerSearch). *@
<div class="pagination-container">
    <ul class="pagination">
        @if (Model.Page > 1)
        {
            <li class="PagedList-skipToPrevious">
                <a asp-controller="CrawlerSearch" asp-action="Index" asp-all-route-data="GetRouteData(Model.Page - 1)">
                    @HtmlLocalizer["previous"]
                </a>
            </li>
        }

        @for (int pageNumber = 1; pageNumber <= Model.TotalPages; pageNumber++)
        {
            if (pageNumber == Model.Page)
            {
                <li class="active">
                    <span>
                        @pageNumber
                    </span>
                </li>
            }
            else
            {
                <li>
                    <a asp-controller="CrawlerSearch" asp-action="Index" asp-all-route-data="GetRouteData(pageNumber)">@pageNumber</a>
                </li>
            }
        }

        @if (Model.Page < Model.TotalPages)
        {
            <li class="PagedList-skipToNext">
                <a asp-controller="CrawlerSearch" asp-action="Index" asp-all-route-data="GetRouteData(Model.Page + 1)">
                    @HtmlLocalizer["next"]
                </a>
            </li>
        }
    </ul>
</div>
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍