Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sample implementation for pages crawler #5

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion src/Kentico.Xperience.Lucene.Sample/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
The constraint ensures that broken URLs lead to a "404 page not found" page and are not handled by a controller dedicated to the component or
to a page handled by the content tree-based router (which would lead to an exception).
*/
const string CONSTRAINT_FOR_NON_ROUTER_PAGE_CONTROLLERS = "Account|Consent|Subscription|Coffees|Search";
const string CONSTRAINT_FOR_NON_ROUTER_PAGE_CONTROLLERS = "Account|Consent|Subscription|Coffees|Search|CrawlerSearch";

var builder = WebApplication.CreateBuilder(args);

Expand Down Expand Up @@ -95,9 +95,17 @@ The constraint ensures that broken URLs lead to a "404 page not found" page and
DancingGoatSearchModel.IndexName,
indexPath: null,
new DancingGoatLuceneIndexingStrategy()),
new LuceneIndex(
typeof(DancingGoatCrawlerSearchModel),
new StandardAnalyzer(Lucene.Net.Util.LuceneVersion.LUCENE_48),
DancingGoatCrawlerSearchModel.IndexName,
indexPath: null,
new DancingGoatCrawlerLuceneIndexingStrategy()),
});
builder.Services.AddSingleton<WebScraperHtmlSanitizer>();
builder.Services.AddSingleton<DancingGoatSearchService>();
builder.Services.AddHttpClient<WebCrawlerService>();
builder.Services.AddSingleton<DancingGoatCrawlerSearchService>();

ConfigureMembershipServices(builder.Services);
ConfigurePageBuilderFilters();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
using Microsoft.AspNetCore.Mvc;

namespace DancingGoat.Search;

/// <summary>
/// MVC controller behind the crawler search page; it delegates every request to
/// <see cref="DancingGoatCrawlerSearchService"/> and renders the returned result model.
/// </summary>
public class CrawlerSearchController : Controller
{
    private readonly DancingGoatCrawlerSearchService searchService;

    /// <summary>
    /// Creates the controller with the search service it forwards queries to.
    /// </summary>
    /// <param name="searchService">Service that runs the search.</param>
    public CrawlerSearchController(DancingGoatCrawlerSearchService searchService)
    {
        this.searchService = searchService;
    }

    /// <summary>
    /// Runs a search and renders the default view with the result.
    /// </summary>
    /// <param name="query">Text to search for; passed to the service unchanged.</param>
    /// <param name="pageSize">Requested number of hits per page.</param>
    /// <param name="page">Requested page number.</param>
    /// <returns>The view bound to the search result model.</returns>
    [HttpGet]
    public IActionResult Index(string query, int pageSize = 10, int page = 1) =>
        View(searchService.Search(query, pageSize, page));
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
using CMS.Core;
using CMS.DocumentEngine;
using CMS.DocumentEngine.Types.DancingGoatCore;
using Kentico.Xperience.Lucene.Attributes;
using Kentico.Xperience.Lucene.Models;
using Kentico.Xperience.Lucene.Services.Implementations;
using Lucene.Net.Documents;

namespace DancingGoat.Search;

/// <summary>
/// Search model of the "DancingGoatCrawler" index. It covers every path ("/%") for the listed
/// content types and exposes the page title plus the crawled page content as text fields.
/// </summary>
[IncludedPath("/%", ContentTypes = new[]
{
    AboutUs.CLASS_NAME,
    Article.CLASS_NAME,
    CafeSection.CLASS_NAME,
    Coffee.CLASS_NAME,
    Contacts.CLASS_NAME,
    Home.CLASS_NAME,
})]
public class DancingGoatCrawlerSearchModel : LuceneSearchModel
{
    /// <summary>
    /// Name under which the crawler index is registered and looked up.
    /// </summary>
    public const string IndexName = "DancingGoatCrawler";

    /// <summary>
    /// Page title, sourced from the node's document name.
    /// </summary>
    // NOTE(review): the boolean passed to TextField presumably controls whether the value is stored
    // in the index (Title is read back from hits by the search service) - confirm against TextFieldAttribute.
    //[ Source(new string[] { nameof(NewsPage.Title), nameof(TreeNode.DocumentName) })]
    [TextField(true), Source(new[] { nameof(TreeNode.DocumentName) })]
    public string Title { get; set; }

    /// <summary>
    /// Text of the rendered page; the value is supplied by
    /// <see cref="DancingGoatCrawlerLuceneIndexingStrategy"/> at indexing time.
    /// </summary>
    [TextField(false)]
    public string CrawlerContent { get; set; }
}

/// <summary>
/// Indexing strategy that fills <see cref="DancingGoatCrawlerSearchModel.CrawlerContent"/> with the
/// sanitized HTML of the crawled page. All other properties keep the value found by the default logic.
/// </summary>
public class DancingGoatCrawlerLuceneIndexingStrategy : DefaultLuceneIndexingStrategy
{
    /// <summary>
    /// Returns the value to index for a single property of the search model.
    /// </summary>
    /// <param name="node">Page being indexed.</param>
    /// <param name="propertyName">Name of the search model property being populated.</param>
    /// <param name="usedColumn">Not used by this override.</param>
    /// <param name="foundValue">Value already resolved for the property.</param>
    /// <returns>
    /// The sanitized crawled content for <c>CrawlerContent</c>; <paramref name="foundValue"/> otherwise.
    /// </returns>
    public override async Task<object> OnIndexingProperty(TreeNode node, string propertyName, string usedColumn, object foundValue)
    {
        // Only the crawler content is computed here; everything else passes through untouched.
        if (propertyName != nameof(DancingGoatCrawlerSearchModel.CrawlerContent))
        {
            return foundValue;
        }

        // The strategy is instantiated with "new" during index registration, so its collaborators
        // are resolved from the service locator instead of being constructor-injected.
        var sanitizer = Service.Resolve<WebScraperHtmlSanitizer>();
        var crawler = Service.Resolve<WebCrawlerService>();

        string rawHtml = await crawler.CrawlNode(node);

        return sanitizer.SanitizeHtmlDocument(rawHtml);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

namespace DancingGoat.Search;

/// <summary>
/// A single hit of the crawler search as shown on the results page.
/// </summary>
public class DancingGoatCrawlerSearchResultModel
{
/// <summary>Title of the matched page; rendered as the link text.</summary>
public string Title { get; set; }
/// <summary>Content type of the matched page; the search service fills it from the document's ClassName field.</summary>
public string ContentType { get; set; }
/// <summary>URL of the matched page; rendered as the link target.</summary>
public string Url { get; set; }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
using Kentico.Xperience.Lucene;
using Kentico.Xperience.Lucene.Models;
using Kentico.Xperience.Lucene.Services;
using Lucene.Net.Documents;
using Lucene.Net.Search;
using Lucene.Net.Util;

namespace DancingGoat.Search;

/// <summary>
/// Runs paged full-text searches against the crawler index
/// (<see cref="DancingGoatCrawlerSearchModel.IndexName"/>).
/// </summary>
public class DancingGoatCrawlerSearchService
{
    private const int PHRASE_SLOP = 3;
    private const int MAX_RESULTS = 1000;

    private readonly ILuceneIndexService luceneIndexService;

    /// <summary>
    /// Creates the service.
    /// </summary>
    /// <param name="luceneIndexService">Service used to obtain a searcher for the index.</param>
    public DancingGoatCrawlerSearchService(ILuceneIndexService luceneIndexService) => this.luceneIndexService = luceneIndexService;

    /// <summary>
    /// Searches the crawler index and returns one page of hits.
    /// </summary>
    /// <param name="searchText">Text to search for. Null, empty or whitespace matches all documents.</param>
    /// <param name="pageSize">Number of hits per page; values below 1 are raised to 1.</param>
    /// <param name="page">1-based page number; values below 1 are raised to 1.</param>
    /// <returns>The requested page of hits together with paging information.</returns>
    /// <exception cref="Exception">The crawler index is not registered.</exception>
    public LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel> Search(string searchText, int pageSize = 20, int page = 1)
    {
        // The message must name the index that was actually looked up.
        var index = IndexStore.Instance.GetIndex(DancingGoatCrawlerSearchModel.IndexName) ?? throw new Exception($"Index {DancingGoatCrawlerSearchModel.IndexName} was not found!!!");
        pageSize = Math.Max(1, pageSize);
        page = Math.Max(1, page);
        int offset = pageSize * (page - 1);
        int limit = pageSize;

        var queryBuilder = new QueryBuilder(index.Analyzer);

        var query = string.IsNullOrWhiteSpace(searchText)
            ? new MatchAllDocsQuery()
            : GetTermQuery(queryBuilder, searchText);

        var result = luceneIndexService.UseSearcher(
            index,
            (searcher) =>
            {
                // Only the top MAX_RESULTS hits are fetched, so pages beyond that window come back
                // empty even though TotalHits reports every match.
                var topDocs = searcher.Search(query, MAX_RESULTS);
                return new LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel>()
                {
                    Query = searchText ?? "",
                    Page = page,
                    PageSize = pageSize,
                    TotalPages = topDocs.TotalHits <= 0 ? 0 : ((topDocs.TotalHits - 1) / pageSize) + 1,
                    TotalHits = topDocs.TotalHits,
                    Hits = topDocs.ScoreDocs
                        .Skip(offset)
                        .Take(limit)
                        .Select(d => MapToResultItem(searcher.Doc(d.Doc)))
                        .ToList(),
                };
            }
        );

        return result;
    }

    /// <summary>
    /// Builds the query for a non-empty search text: phrase matches (with slop) and loose term matches
    /// on both the title and the crawled content, combined as optional clauses. Title matches are
    /// boosted above content matches, phrase matches above loose matches.
    /// </summary>
    /// <param name="queryBuilder">Builder bound to the index analyzer.</param>
    /// <param name="searchText">Text entered by the user.</param>
    /// <returns>A boolean query; it has no clauses when the analyzer produces no terms for the text.</returns>
    private static Query GetTermQuery(QueryBuilder queryBuilder, string searchText)
    {
        const string titleField = nameof(DancingGoatCrawlerSearchModel.Title);
        const string contentField = nameof(DancingGoatCrawlerSearchModel.CrawlerContent);

        var combined = new BooleanQuery();

        AddOptionalClause(combined, queryBuilder.CreatePhraseQuery(titleField, searchText, PHRASE_SLOP), 5);
        AddOptionalClause(combined, queryBuilder.CreatePhraseQuery(contentField, searchText, PHRASE_SLOP), 1);
        // The loose title match must target the Title field, not the content field.
        AddOptionalClause(combined, queryBuilder.CreateBooleanQuery(titleField, searchText, Occur.SHOULD), 0.5f);
        AddOptionalClause(combined, queryBuilder.CreateBooleanQuery(contentField, searchText, Occur.SHOULD), 0.1f);

        return combined;
    }

    /// <summary>
    /// Adds <paramref name="clause"/> to <paramref name="target"/> as an optional clause with the given boost.
    /// </summary>
    /// <param name="target">Query that receives the clause.</param>
    /// <param name="clause">Clause to add; null is ignored.</param>
    /// <param name="boost">Boost assigned to the clause.</param>
    private static void AddOptionalClause(BooleanQuery target, Query clause, float boost)
    {
        // QueryBuilder returns null when analysis of the text yields no terms (for example a
        // stop-word-only search); skipping the clause avoids a NullReferenceException.
        if (clause is null)
        {
            return;
        }

        clause.Boost = boost;
        target.Add(clause, Occur.SHOULD);
    }

    /// <summary>
    /// Maps the stored fields of a Lucene document to a result item.
    /// </summary>
    /// <param name="doc">Document returned by the searcher.</param>
    /// <returns>Result item carrying title, URL and content type.</returns>
    // NOTE(review): Url and ClassName are not declared on DancingGoatCrawlerSearchModel in this file;
    // presumably they are inherited from LuceneSearchModel - confirm.
    private DancingGoatCrawlerSearchResultModel MapToResultItem(Document doc) => new()
    {
        Title = doc.Get(nameof(DancingGoatCrawlerSearchModel.Title)),
        Url = doc.Get(nameof(DancingGoatCrawlerSearchModel.Url)),
        ContentType = doc.Get(nameof(DancingGoatCrawlerSearchModel.ClassName)),
    };
}
37 changes: 37 additions & 0 deletions src/Kentico.Xperience.Lucene.Sample/Search/WebCrawlerService.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
using CMS.DocumentEngine;
using Kentico.Content.Web.Mvc;
using Microsoft.Net.Http.Headers;

namespace DancingGoat.Search;

public class WebCrawlerService
{
private readonly HttpClient httpClient;
private readonly IPageUrlRetriever urlRetriever;

[System.Diagnostics.CodeAnalysis.SuppressMessage("Major Code Smell", "S125:Sections of code should not be commented out", Justification = "Comments contain possible alternative solutions")]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

public WebCrawlerService(HttpClient httpClient, IPageUrlRetriever urlRetriever)
{
    this.urlRetriever = urlRetriever;

    // Client-wide configuration (custom headers etc.) is done here in the constructor.
    // Every crawl request identifies itself with this User-Agent value.
    httpClient.DefaultRequestHeaders.Add(HeaderNames.UserAgent, "SearchCrawler");

    // The base address is the domain URL that DocumentURLProvider returns for "DancingGoatCore",
    // which lets CrawlNode request relative page URLs.
    // alternatively specify custom url or load it from settings
    // httpClient.BaseAddress = new Uri("http://localhost:41489/");
    httpClient.BaseAddress = new Uri(DocumentURLProvider.GetDomainUrl("DancingGoatCore"));

    this.httpClient = httpClient;
}

/// <summary>
/// Downloads the rendered page of the given node.
/// </summary>
/// <param name="node">Page whose URL is resolved and crawled.</param>
/// <returns>The response body returned for the page URL.</returns>
public async Task<string> CrawlNode(TreeNode node)
{
    // urlRetriever.Retrieve(node).AbsolutePath and no BaseAddress could be used as an alternative
    var pageUrl = urlRetriever.Retrieve(node);

    // Strip the leading '~' so the path can be requested relative to the client's BaseAddress.
    string relativeUrl = pageUrl.RelativePath.TrimStart('~');

    return await CrawlPage(relativeUrl);
}

/// <summary>
/// Requests the given URL with the configured HTTP client and returns the response body.
/// </summary>
/// <param name="url">URL to request; relative URLs are resolved against the client's BaseAddress.</param>
/// <returns>The response body as a string.</returns>
public async Task<string> CrawlPage(string url)
{
    // Dispose the response so its content stream and connection are released promptly.
    // NOTE(review): the status code is not checked, so the body of an error page is returned
    // (and indexed) like regular content - confirm this is intended.
    using var response = await httpClient.GetAsync(url);
    return await response.Content.ReadAsStringAsync();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,18 @@ public virtual string SanitizeHtmlDocument(string htmlContent)
element.Remove();
}

// Removes header
foreach (var element in body.QuerySelectorAll("header"))
{
element.Remove();
}

// Removes footer
foreach (var element in body.QuerySelectorAll(".footer-wrapper"))
{
element.Remove();
}

// Gets the text content of the body element
string textContent = body.TextContent;

Expand Down
100 changes: 100 additions & 0 deletions src/Kentico.Xperience.Lucene.Sample/Views/CrawlerSearch/Index.cshtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
@using Kentico.Xperience.Lucene.Models
@using DancingGoat.Search

@model LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel>

@{
    // Builds the query-string values of a pagination link. The keys must match the parameter names
    // of CrawlerSearchController.Index (query, pageSize, page); a key that does not match is not
    // bound and the search text would be lost when switching pages.
    Dictionary<string, string> GetRouteData(int page) =>
        new Dictionary<string, string>()
        {
            { "query", Model.Query },
            { "pageSize", Model.PageSize.ToString() },
            { "page", page.ToString() },
        };
}

<h1>Search</h1>

<style>
    .form-field {
        margin-bottom: 0.8rem;
    }
</style>

@* Search form: submits the query (plus current paging values) to the Index action via GET. *@
<div class="row" style="padding: 1rem;">
    <div class="col-12">
        <form asp-action="Index" method="get">
            <div class="row">
                <div class="col-md-12">
                    <div class="form-field">
                        <label class="control-label" asp-for="@Model.Query"></label>
                        <div class="editing-form-control-nested-control">
                            <input class="form-control" asp-for="@Model.Query">
                            <input type="hidden" asp-for="@Model.Page" />
                            <input type="hidden" asp-for="@Model.PageSize" />
                        </div>
                    </div>
                </div>
            </div>

            <input type="submit" value="Submit">
        </form>
    </div>
</div>

@* Without hits there is nothing to list or paginate; show a message only when a query was entered. *@
@if (!Model.Hits.Any())
{
    if (!String.IsNullOrWhiteSpace(Model.Query))
    {
        @HtmlLocalizer["Sorry, no results match {0}", Model.Query]
    }

    return;
}

@foreach (var item in Model.Hits)
{
    <div class="row search-tile">
        <div class="col-md-12 col-lg-12 search-tile-content">
            <h3 class="h4 search-tile-title">
                <a href="@item.Url">@item.Title</a>
            </h3>
            @item.ContentType
        </div>
    </div>
}

@* Pagination: links stay on the crawler search controller so paging does not switch to the other search page. *@
<div class="pagination-container">
    <ul class="pagination">
        @if (Model.Page > 1)
        {
            <li class="PagedList-skipToPrevious">
                <a asp-controller="CrawlerSearch" asp-action="Index" asp-all-route-data="GetRouteData(Model.Page - 1)">
                    @HtmlLocalizer["previous"]
                </a>
            </li>
        }

        @for (int pageNumber = 1; pageNumber <= Model.TotalPages; pageNumber++)
        {
            if (pageNumber == Model.Page)
            {
                <li class="active">
                    <span>
                        @pageNumber
                    </span>
                </li>
            }
            else
            {
                <li>
                    <a asp-controller="CrawlerSearch" asp-action="Index" asp-all-route-data="GetRouteData(pageNumber)">@pageNumber</a>
                </li>
            }
        }

        @if (Model.Page < Model.TotalPages)
        {
            <li class="PagedList-skipToNext">
                <a asp-controller="CrawlerSearch" asp-action="Index" asp-all-route-data="GetRouteData(Model.Page + 1)">
                    @HtmlLocalizer["next"]
                </a>
            </li>
        }
    </ul>
</div>
Loading