Skip to content

Commit

Permalink
Merge pull request #5 from nittin-cz/feature/sample-crawler-implement…
Browse files Browse the repository at this point in the history
…ation
  • Loading branch information
seangwright authored Aug 3, 2023
2 parents fd8cbfd + 2e4a6ed commit cca72c8
Show file tree
Hide file tree
Showing 8 changed files with 319 additions and 1 deletion.
10 changes: 9 additions & 1 deletion src/Kentico.Xperience.Lucene.Sample/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
The constraint ensures that broken URLs lead to a "404 page not found" page and are not handled by a controller dedicated to the component or
to a page handled by the content tree-based router (which would lead to an exception).
*/
const string CONSTRAINT_FOR_NON_ROUTER_PAGE_CONTROLLERS = "Account|Consent|Subscription|Coffees|Search";
const string CONSTRAINT_FOR_NON_ROUTER_PAGE_CONTROLLERS = "Account|Consent|Subscription|Coffees|Search|CrawlerSearch";

var builder = WebApplication.CreateBuilder(args);

Expand Down Expand Up @@ -95,9 +95,17 @@ The constraint ensures that broken URLs lead to a "404 page not found" page and
DancingGoatSearchModel.IndexName,
indexPath: null,
new DancingGoatLuceneIndexingStrategy()),
new LuceneIndex(
typeof(DancingGoatCrawlerSearchModel),
new StandardAnalyzer(Lucene.Net.Util.LuceneVersion.LUCENE_48),
DancingGoatCrawlerSearchModel.IndexName,
indexPath: null,
new DancingGoatCrawlerLuceneIndexingStrategy()),
});
// Sanitizer used to strip markup from crawled HTML before it is indexed
// (resolved by DancingGoatCrawlerLuceneIndexingStrategy).
builder.Services.AddSingleton<WebScraperHtmlSanitizer>();
// Search service backing the regular (non-crawler) search page.
builder.Services.AddSingleton<DancingGoatSearchService>();
// Typed HttpClient registration: WebCrawlerService receives its HttpClient through the constructor.
builder.Services.AddHttpClient<WebCrawlerService>();
// Search service backing the crawler-based search page (CrawlerSearchController).
builder.Services.AddSingleton<DancingGoatCrawlerSearchService>();

ConfigureMembershipServices(builder.Services);
ConfigurePageBuilderFilters();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
using Microsoft.AspNetCore.Mvc;

namespace DancingGoat.Search;

/// <summary>
/// MVC controller serving the search page that queries the crawler-based Lucene index.
/// </summary>
public class CrawlerSearchController : Controller
{
    private readonly DancingGoatCrawlerSearchService searchService;

    /// <summary>
    /// Creates the controller with the search service it delegates to.
    /// </summary>
    /// <param name="searchService">Service that executes queries against the crawler index.</param>
    public CrawlerSearchController(DancingGoatCrawlerSearchService searchService)
    {
        this.searchService = searchService;
    }

    /// <summary>
    /// Runs a search and renders the result view.
    /// </summary>
    /// <param name="query">Text entered by the visitor.</param>
    /// <param name="pageSize">Number of hits per page.</param>
    /// <param name="page">1-based page number.</param>
    /// <returns>The default view with the search result as its model.</returns>
    [HttpGet]
    public IActionResult Index(string query, int pageSize = 10, int page = 1)
        => View(searchService.Search(query, pageSize, page));
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
using CMS.Core;
using CMS.DocumentEngine;
using CMS.DocumentEngine.Types.DancingGoatCore;
using Kentico.Xperience.Lucene.Attributes;
using Kentico.Xperience.Lucene.Models;
using Kentico.Xperience.Lucene.Services.Implementations;
using Lucene.Net.Documents;

namespace DancingGoat.Search;

/// <summary>
/// Search model of the crawler-based index. Besides the page title it holds
/// <see cref="CrawlerContent"/>, which has no source column and is filled in by
/// <see cref="DancingGoatCrawlerLuceneIndexingStrategy"/>.
/// </summary>
[IncludedPath("/%", ContentTypes = new[]
{
    AboutUs.CLASS_NAME,
    Article.CLASS_NAME,
    CafeSection.CLASS_NAME,
    Coffee.CLASS_NAME,
    Contacts.CLASS_NAME,
    Home.CLASS_NAME,
})]
public class DancingGoatCrawlerSearchModel : LuceneSearchModel
{
    /// <summary>Code name under which the crawler index is registered and looked up.</summary>
    public const string IndexName = "DancingGoatCrawler";

    /// <summary>
    /// Page title, read from the document name.
    /// </summary>
    // Alternative with several candidate columns:
    // [Source(new string[] { nameof(NewsPage.Title), nameof(TreeNode.DocumentName) })]
    // NOTE(review): the boolean passed to TextField presumably controls whether the value is stored - confirm.
    [TextField(true), Source(new[] { nameof(TreeNode.DocumentName) })]
    public string Title { get; set; }

    /// <summary>
    /// Text of the rendered page; supplied by the indexing strategy rather than a source column.
    /// </summary>
    [TextField(false)]
    public string CrawlerContent { get; set; }
}

/// <summary>
/// Indexing strategy that fills <see cref="DancingGoatCrawlerSearchModel.CrawlerContent"/>
/// by downloading the page and reducing its HTML with the sanitizer.
/// </summary>
public class DancingGoatCrawlerLuceneIndexingStrategy : DefaultLuceneIndexingStrategy
{
    /// <summary>
    /// Returns the value to index for a single model property.
    /// </summary>
    /// <param name="node">Page being indexed.</param>
    /// <param name="propertyName">Name of the search model property being processed.</param>
    /// <param name="usedColumn">Not used by this override.</param>
    /// <param name="foundValue">Value found for the property before this hook runs.</param>
    /// <returns>
    /// The sanitized crawled content for <c>CrawlerContent</c>; otherwise
    /// <paramref name="foundValue"/> unchanged.
    /// </returns>
    public override async Task<object> OnIndexingProperty(TreeNode node, string propertyName, string usedColumn, object foundValue)
    {
        // Every property other than the crawled content is passed through untouched.
        if (propertyName != nameof(DancingGoatCrawlerSearchModel.CrawlerContent))
        {
            return foundValue;
        }

        var sanitizer = Service.Resolve<WebScraperHtmlSanitizer>();
        var crawler = Service.Resolve<WebCrawlerService>();

        string pageHtml = await crawler.CrawlNode(node);

        return sanitizer.SanitizeHtmlDocument(pageHtml);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

namespace DancingGoat.Search;

/// <summary>
/// Single hit returned by <see cref="DancingGoatCrawlerSearchService"/> and rendered by the crawler search view.
/// </summary>
public class DancingGoatCrawlerSearchResultModel
{
/// <summary>Value of the "Title" field of the matching Lucene document.</summary>
public string Title { get; set; }
/// <summary>Value of the "ClassName" field of the matching Lucene document.</summary>
public string ContentType { get; set; }
/// <summary>Value of the "Url" field of the matching Lucene document; used as the link target in the view.</summary>
public string Url { get; set; }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
using Kentico.Xperience.Lucene;
using Kentico.Xperience.Lucene.Models;
using Kentico.Xperience.Lucene.Services;
using Lucene.Net.Documents;
using Lucene.Net.Search;
using Lucene.Net.Util;

namespace DancingGoat.Search;

/// <summary>
/// Executes paged full-text searches against the crawler-based Lucene index
/// (<see cref="DancingGoatCrawlerSearchModel.IndexName"/>).
/// </summary>
public class DancingGoatCrawlerSearchService
{
    private const int PHRASE_SLOP = 3;
    private const int MAX_RESULTS = 1000;

    private readonly ILuceneIndexService luceneIndexService;

    /// <summary>
    /// Creates the service.
    /// </summary>
    /// <param name="luceneIndexService">Service providing a searcher for a registered index.</param>
    public DancingGoatCrawlerSearchService(ILuceneIndexService luceneIndexService) => this.luceneIndexService = luceneIndexService;

    /// <summary>
    /// Searches the crawler index.
    /// </summary>
    /// <param name="searchText">Text to search for; null or whitespace matches all documents.</param>
    /// <param name="pageSize">Hits per page; values below 1 are treated as 1.</param>
    /// <param name="page">1-based page number; values below 1 are treated as 1.</param>
    /// <returns>The requested page of hits together with paging metadata.</returns>
    /// <exception cref="InvalidOperationException">The crawler index is not registered.</exception>
    public LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel> Search(string searchText, int pageSize = 20, int page = 1)
    {
        var index = IndexStore.Instance.GetIndex(DancingGoatCrawlerSearchModel.IndexName)
            ?? throw new InvalidOperationException($"Index {DancingGoatCrawlerSearchModel.IndexName} was not found!!!");

        pageSize = Math.Max(1, pageSize);
        page = Math.Max(1, page);

        // Computed in 64-bit and capped: at most MAX_RESULTS documents are ever fetched, so skipping
        // MAX_RESULTS already yields an empty page, and the int multiplication can no longer overflow.
        int offset = (int)Math.Min((long)pageSize * (page - 1), MAX_RESULTS);
        int limit = pageSize;

        var queryBuilder = new QueryBuilder(index.Analyzer);

        Query query = string.IsNullOrWhiteSpace(searchText)
            ? new MatchAllDocsQuery()
            : GetTermQuery(queryBuilder, searchText);

        var result = luceneIndexService.UseSearcher(
            index,
            (searcher) =>
            {
                var topDocs = searcher.Search(query, MAX_RESULTS);

                // Only the first MAX_RESULTS hits can be paged through, so the page count is derived
                // from that number; TotalHits still reports the real match count.
                var reachableHits = Math.Min(topDocs.TotalHits, MAX_RESULTS);

                return new LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel>()
                {
                    Query = searchText ?? "",
                    Page = page,
                    PageSize = pageSize,
                    TotalPages = reachableHits <= 0 ? 0 : ((reachableHits - 1) / pageSize) + 1,
                    TotalHits = topDocs.TotalHits,
                    Hits = topDocs.ScoreDocs
                        .Skip(offset)
                        .Take(limit)
                        .Select(d => MapToResultItem(searcher.Doc(d.Doc)))
                        .ToList(),
                };
            }
        );

        return result;
    }

    /// <summary>
    /// Builds the query for a non-empty search text: phrase and any-term matches on the title and the
    /// crawled content, combined as optional clauses with title matches boosted above content matches.
    /// </summary>
    /// <param name="queryBuilder">Builder bound to the analyzer of the index.</param>
    /// <param name="searchText">Text entered by the visitor.</param>
    /// <returns>
    /// A boolean query of the clauses that could be built; it has no clauses (and matches nothing)
    /// when the analyzer produces no tokens for <paramref name="searchText"/>.
    /// </returns>
    private static Query GetTermQuery(QueryBuilder queryBuilder, string searchText)
    {
        const string titleField = nameof(DancingGoatCrawlerSearchModel.Title);
        const string contentField = nameof(DancingGoatCrawlerSearchModel.CrawlerContent);

        var combined = new BooleanQuery();

        AddBoostedClause(combined, queryBuilder.CreatePhraseQuery(titleField, searchText, PHRASE_SLOP), 5f);
        AddBoostedClause(combined, queryBuilder.CreatePhraseQuery(contentField, searchText, PHRASE_SLOP), 1f);
        AddBoostedClause(combined, queryBuilder.CreateBooleanQuery(titleField, searchText, Occur.SHOULD), 0.5f);
        AddBoostedClause(combined, queryBuilder.CreateBooleanQuery(contentField, searchText, Occur.SHOULD), 0.1f);

        return combined;
    }

    /// <summary>
    /// Adds <paramref name="clause"/> to <paramref name="target"/> as an optional clause with the given boost.
    /// </summary>
    private static void AddBoostedClause(BooleanQuery target, Query clause, float boost)
    {
        // QueryBuilder returns null when analysis yields no tokens (e.g. the text consists of stop words only).
        if (clause is null)
        {
            return;
        }

        clause.Boost = boost;
        target.Add(clause, Occur.SHOULD);
    }

    /// <summary>
    /// Maps the stored fields of a Lucene document to a result item.
    /// </summary>
    private DancingGoatCrawlerSearchResultModel MapToResultItem(Document doc) => new()
    {
        Title = doc.Get(nameof(DancingGoatCrawlerSearchModel.Title)),
        Url = doc.Get(nameof(DancingGoatCrawlerSearchModel.Url)),
        ContentType = doc.Get(nameof(DancingGoatCrawlerSearchModel.ClassName)),
    };
}
37 changes: 37 additions & 0 deletions src/Kentico.Xperience.Lucene.Sample/Search/WebCrawlerService.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
using CMS.DocumentEngine;
using Kentico.Content.Web.Mvc;
using Microsoft.Net.Http.Headers;

namespace DancingGoat.Search;

/// <summary>
/// Downloads the rendered HTML of pages so that their content can be indexed.
/// </summary>
public class WebCrawlerService
{
    private readonly HttpClient httpClient;
    private readonly IPageUrlRetriever urlRetriever;

    /// <summary>
    /// Creates the crawler and configures the supplied client (user agent and base address).
    /// </summary>
    /// <param name="httpClient">Client used for all requests; its default headers and base address are set here.</param>
    /// <param name="urlRetriever">Resolves the URL of a page.</param>
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Major Code Smell", "S125:Sections of code should not be commented out", Justification = "Comments contain possible alternative solutions")]
    public WebCrawlerService(HttpClient httpClient, IPageUrlRetriever urlRetriever)
    {
        this.httpClient = httpClient;
        // configure the client inside constructor if needed (add custom headers etc.)
        this.httpClient.DefaultRequestHeaders.Add(HeaderNames.UserAgent, "SearchCrawler");
        this.httpClient.BaseAddress = new Uri(DocumentURLProvider.GetDomainUrl("DancingGoatCore"));
        // alternatively specify custom url or load it from settings
        // this.httpClient.BaseAddress = new Uri("http://localhost:41489/");

        this.urlRetriever = urlRetriever;
    }

    /// <summary>
    /// Downloads the page represented by <paramref name="node"/>.
    /// </summary>
    /// <param name="node">Page whose rendered output should be fetched.</param>
    /// <returns>The response body as text.</returns>
    public async Task<string> CrawlNode(TreeNode node)
    {
        // The retrieved relative path has its leading '~' removed so it can be requested against BaseAddress.
        string url = urlRetriever.Retrieve(node).RelativePath.TrimStart('~');
        // urlRetriever.Retrieve(node).AbsolutePath and no BaseAddress could be used as an alternative
        return await CrawlPage(url);
    }

    /// <summary>
    /// Issues a GET request for <paramref name="url"/> and returns the response body.
    /// </summary>
    /// <param name="url">URL to request, relative to the client's base address.</param>
    /// <returns>The response body as text.</returns>
    public async Task<string> CrawlPage(string url)
    {
        // Dispose the response so its content and connection resources are released promptly.
        // NOTE(review): the status code is not checked, so the body of an error response is returned as well.
        using var response = await httpClient.GetAsync(url);
        return await response.Content.ReadAsStringAsync();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,18 @@ public virtual string SanitizeHtmlDocument(string htmlContent)
element.Remove();
}

// Removes header
foreach (var element in body.QuerySelectorAll("header"))
{
element.Remove();
}

// Removes footer
foreach (var element in body.QuerySelectorAll(".footer-wrapper"))
{
element.Remove();
}

// Gets the text content of the body element
string textContent = body.TextContent;

Expand Down
100 changes: 100 additions & 0 deletions src/Kentico.Xperience.Lucene.Sample/Views/CrawlerSearch/Index.cshtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
@*
    Search page of the crawler-based index: query form, list of hits and pagination.
    Rendered by CrawlerSearchController.Index.
*@
@using Kentico.Xperience.Lucene.Models
@using DancingGoat.Search

@model LuceneSearchResultModel<DancingGoatCrawlerSearchResultModel>

@{
    // Route values for a pagination link. The keys must match the parameter names of
    // CrawlerSearchController.Index (query, pageSize, page), otherwise they are not bound.
    Dictionary<string, string> GetRouteData(int page) =>
        new Dictionary<string, string>()
        {
            { "query", Model.Query },
            { "pageSize", Model.PageSize.ToString() },
            { "page", page.ToString() },
        };
}

<h1>Search</h1>

<style>
    .form-field {
        margin-bottom: 0.8rem;
    }
</style>

<div class="row" style="padding: 1rem;">
    <div class="col-12">
        <form asp-action="Index" method="get">
            <div class="row">
                <div class="col-md-12">
                    <div class="form-field">
                        <label class="control-label" asp-for="@Model.Query"></label>
                        <div class="editing-form-control-nested-control">
                            <input class="form-control" asp-for="@Model.Query">
                            <input type="hidden" asp-for="@Model.Page" />
                            <input type="hidden" asp-for="@Model.PageSize" />
                        </div>
                    </div>
                </div>
            </div>

            <input type="submit" value="Submit">
        </form>
    </div>
</div>

@if (!Model.Hits.Any())
{
    if (!String.IsNullOrWhiteSpace(Model.Query))
    {
        @HtmlLocalizer["Sorry, no results match {0}", Model.Query]
    }

    return;
}

@foreach (var item in Model.Hits)
{
    <div class="row search-tile">
        <div class="col-md-12 col-lg-12 search-tile-content">
            <h3 class="h4 search-tile-title">
                <a href="@item.Url">@item.Title</a>
            </h3>
            @item.ContentType
        </div>
    </div>
}

@* Pagination links target this page's own controller (CrawlerSearch). *@
<div class="pagination-container">
    <ul class="pagination">
        @if (Model.Page > 1)
        {
            <li class="PagedList-skipToPrevious">
                <a asp-controller="CrawlerSearch" asp-action="Index" asp-all-route-data="GetRouteData(Model.Page - 1)">
                    @HtmlLocalizer["previous"]
                </a>
            </li>
        }

        @for (int pageNumber = 1; pageNumber <= Model.TotalPages; pageNumber++)
        {
            if (pageNumber == Model.Page)
            {
                <li class="active">
                    <span>
                        @pageNumber
                    </span>
                </li>
            }
            else
            {
                <li>
                    <a asp-controller="CrawlerSearch" asp-action="Index" asp-all-route-data="GetRouteData(pageNumber)">@pageNumber</a>
                </li>
            }
        }

        @if (Model.Page < Model.TotalPages)
        {
            <li class="PagedList-skipToNext">
                <a asp-controller="CrawlerSearch" asp-action="Index" asp-all-route-data="GetRouteData(Model.Page + 1)">
                    @HtmlLocalizer["next"]
                </a>
            </li>
        }
    </ul>
</div>

0 comments on commit cca72c8

Please sign in to comment.