Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lucene decay #15

Merged
merged 2 commits into from
Sep 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,17 @@ dotnet add package Kentico.Xperience.Lucene
- Read the Lucene.NET [introduction](https://lucenenet.apache.org/) or [full documentation](https://lucenenet.apache.org/docs/4.8.0-beta00016/) to explore the core library's APIs and functionality.
- Explore the [Lucene.NET source on GitHub](https://github.com/apache/lucenenet)

### Implementing document decay feature (scoring by "freshness", "recency")
seangwright marked this conversation as resolved.
Show resolved Hide resolved

1) Boosting relevant fields by setting the field boost (the preferable method, but it requires more work).
2) Boosting one field with a constant value that is always present in the search query (shown in the sample; the less desirable method). The downside of this method is that all documents get matched, so it is usable only in scenarios where the total number of results is not required.
3) Using a sort expression; implementation details can be found in the Lucene.NET unit tests and Lucene.NET implementations.

Methods 1 and 2 require inheriting from `DefaultLuceneIndexingStrategy` and overriding the `OnDocumentAddField` method.
In `OnDocumentAddField`, match the required fields and calculate the boost, then apply it to the desired fields as shown in the example `DancingGoatLuceneIndexingStrategy.OnDocumentAddField`.

> Differences in boosts that are too small will be ignored by Lucene.

## Sample features

### Trigger rebuild of index via webhook
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using Kentico.Xperience.Lucene.Models;
using Kentico.Xperience.Lucene.Services.Implementations;
using Lucene.Net.Documents;
using Lucene.Net.Index;

namespace DancingGoat.Search;

Expand Down Expand Up @@ -46,6 +47,9 @@ public class DancingGoatLuceneIndexingStrategy : DefaultLuceneIndexingStrategy
nameof(Article.ArticleText),
};

// Pick some meaningful start date for this value; if the range of dates is too broad (for example, historic dates), change the decay algorithm instead.
seangwright marked this conversation as resolved.
Show resolved Hide resolved
private static readonly DateTime decayStartDate = new(2020, 1, 1, 0, 0, 0, DateTimeKind.Unspecified);

public override Task<object> OnIndexingProperty(TreeNode node, string propertyName, string usedColumn, object foundValue)
{
object result = foundValue;
Expand Down Expand Up @@ -91,4 +95,31 @@ private static string GetMediaURL(Article article)
? url[1..].ToString()
: url.ToString();
}

/// <summary>
/// Adds an extra, non-stored <c>$decay</c> text field to the document whenever the
/// published-date field is being added, then lets the base strategy handle
/// <paramref name="field"/> itself.
/// </summary>
/// <param name="document">Document the field is being added to.</param>
/// <param name="field">Field about to be added to <paramref name="document"/>.</param>
public override void OnDocumentAddField(Document document, IIndexableField field)
{
    bool isPublishedDateField = field.Name == nameof(DancingGoatSearchModel.PublishedDateTicks);

    // The numeric value is only read when the field name matches.
    if (isPublishedDateField && field.GetInt64Value() is { } publishedUnixMs)
    {
        long publishedTicks = DateTools.UnixTimeMillisecondsToTicks(publishedUnixMs);
        var publishedOn = new DateTime(publishedTicks, DateTimeKind.Unspecified);

        // Reference point: January 1st of the year of the configured decay start date.
        var referenceDate = new DateTime(decayStartDate.Year, 1, 1, 0, 0, 0, DateTimeKind.Unspecified);
        TimeSpan elapsed = publishedOn - referenceDate;

        // Decay is expressed as (fractional) years elapsed since the reference date.
        float yearsSinceStart = (float)elapsed.TotalDays / 365f;

        // Field text is one "q" term per month number of the published date
        // (1..12), separated by spaces, so later months repeat the term more often.
        string repeatedTerms = string.Join(" ", Enumerable.Repeat('q', publishedOn.Month));

        var decayField = new TextField("$decay", repeatedTerms, Field.Store.NO);
        decayField.Boost = yearsSinceStart;

        // To avoid showing irrelevant search results, the boost can instead be applied
        // to existing fields; then no additional field is needed in the index or query.
        document.Add(decayField);
    }

    base.OnDocumentAddField(document, field);
}
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
using System.Diagnostics;
using Kentico.Xperience.Lucene;
using Kentico.Xperience.Lucene.Models;
using Kentico.Xperience.Lucene.Services;
Expand Down Expand Up @@ -27,19 +28,30 @@ public LuceneSearchResultModel<DancingGoatSearchModel> Search(string searchText,
var queryBuilder = new QueryBuilder(index.Analyzer);

var query = string.IsNullOrWhiteSpace(searchText)
? new MatchAllDocsQuery()
? GetDefaultQuery(queryBuilder)
: GetTermQuery(queryBuilder, searchText);

var result = luceneIndexService.UseSearcher(
index,
(searcher) =>
{
var topDocs = searcher.Search(query, MAX_RESULTS,
new Sort(new SortField(
nameof(DancingGoatSearchModel.PublishedDateTicks),
FieldCache.NUMERIC_UTILS_INT64_PARSER,
true)));

var topDocs = searcher.Search(query, MAX_RESULTS
// uncomment if sort by score is not desirable
// ,
// new Sort(new SortField(
// nameof(DancingGoatSearchModel.PublishedDateTicks),
// FieldCache.NUMERIC_UTILS_INT64_PARSER,
// true))
);
foreach (var scoreDoc in topDocs.ScoreDocs
.Skip(offset)
.Take(limit))
{
var explanation = searcher.Explain(query, scoreDoc.Doc);
Trace.WriteLine(explanation);
}


return new LuceneSearchResultModel<DancingGoatSearchModel>()
{
Query = searchText ?? "",
Expand Down Expand Up @@ -73,6 +85,18 @@ private static Query GetDateRangeQuery()
true);
}

/// <summary>
/// Builds a boolean query whose only clause is the low-boost <c>$decay</c> term query.
/// </summary>
/// <param name="queryBuilder">Builder used to create the <c>$decay</c> clause.</param>
/// <returns>A <see cref="BooleanQuery"/> with a single optional decay clause.</returns>
private static Query GetDefaultQuery(QueryBuilder queryBuilder)
{
    // The decay clause must be part of every query that should be scored by decay.
    var decayClause = queryBuilder.CreateBooleanQuery("$decay", "q", Occur.SHOULD);
    decayClause.Boost = 0.01f;

    var defaultQuery = new BooleanQuery();
    defaultQuery.Add(decayClause, Occur.SHOULD);

    return defaultQuery;
}

private static Query GetTermQuery(QueryBuilder queryBuilder, string searchText)
{
var titlePhrase = queryBuilder.CreatePhraseQuery(nameof(DancingGoatSearchModel.Title), searchText, PHRASE_SLOP);
Expand All @@ -85,9 +109,14 @@ private static Query GetTermQuery(QueryBuilder queryBuilder, string searchText)
titleShould.Boost = 0.5f;
var contentShould = queryBuilder.CreateBooleanQuery(nameof(DancingGoatSearchModel.AllContent), searchText, Occur.SHOULD);
contentShould.Boost = 0.1f;

// decay query, SHALL BE defined in queries where we require scoring by decay
var decay = queryBuilder.CreateBooleanQuery("$decay", "q", Occur.SHOULD);
decay.Boost = 0.01f;

return new BooleanQuery
{
{ decay, Occur.SHOULD },
{ titlePhrase, Occur.SHOULD },
{ summaryPhrase, Occur.SHOULD },
{ contentPhrase, Occur.SHOULD },
Expand Down
14 changes: 13 additions & 1 deletion src/Kentico.Xperience.Lucene/Services/ILuceneIndexingStrategy.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
using CMS.DocumentEngine;
using Kentico.Xperience.Lucene.Attributes;
using Kentico.Xperience.Lucene.Models;
using Lucene.Net.Documents;
using Lucene.Net.Facet;
using Lucene.Net.Index;

namespace Kentico.Xperience.Lucene.Services;

Expand Down Expand Up @@ -38,6 +40,16 @@ public interface ILuceneIndexingStrategy
/// <returns>bool</returns>
bool ShouldIndexNode(TreeNode node);


/// <summary>
/// When overridden and a configuration is supplied, indexing will also create a taxonomy index for facet search
seangwright marked this conversation as resolved.
Show resolved Hide resolved
/// </summary>
/// <returns></returns>
FacetsConfig? FacetsConfigFactory();

/// <summary>
/// Called when field is added to document
/// </summary>
/// <param name="document">indexed document</param>
/// <param name="field">indexed field</param>
void OnDocumentAddField(Document document, IIndexableField field);
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
using CMS.DocumentEngine;
using Kentico.Xperience.Lucene.Models;
using Lucene.Net.Documents;
using Lucene.Net.Facet;
using Lucene.Net.Index;

namespace Kentico.Xperience.Lucene.Services.Implementations;

Expand All @@ -20,4 +22,6 @@ public class DefaultLuceneIndexingStrategy : ILuceneIndexingStrategy

/// <inheritdoc />
public virtual FacetsConfig? FacetsConfigFactory() => null;

/// <inheritdoc />
public virtual void OnDocumentAddField(Document document, IIndexableField field)
{
    // Default behavior: add the field to the document unchanged.
    document.Add(field);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using Kentico.Xperience.Lucene.Models;
using Lucene.Net.Documents;
using Lucene.Net.Facet;
using Lucene.Net.Index;
using Newtonsoft.Json;

namespace Kentico.Xperience.Lucene.Services.Implementations;
Expand Down Expand Up @@ -38,39 +39,45 @@ protected void MapModelProps(LuceneIndex luceneIndex, LuceneSearchModel model, D
{
// use reflection to get attributes only once in static code
object? val = prop.GetValue(model);
IIndexableField? field = null;
if (val != null)
{
if (Attribute.IsDefined(prop, typeof(TextFieldAttribute)))
{
var textFieldAttribute = prop.GetCustomAttributes<TextFieldAttribute>(false).FirstOrDefault();
document.Add(new TextField(prop.Name, val?.ToString(), GetStoreFromAttribute(textFieldAttribute)));
field = new TextField(prop.Name, val?.ToString(), GetStoreFromAttribute(textFieldAttribute));
}
if (Attribute.IsDefined(prop, typeof(StringFieldAttribute)))
{
var stringFieldAttribute = prop.GetCustomAttributes<StringFieldAttribute>(false).FirstOrDefault();
document.Add(new StringField(prop.Name, val?.ToString(), GetStoreFromAttribute(stringFieldAttribute)));
field = new StringField(prop.Name, val?.ToString(), GetStoreFromAttribute(stringFieldAttribute));
}
else if (Attribute.IsDefined(prop, typeof(Int32FieldAttribute)))
{
var intFieldAttribute = prop.GetCustomAttributes<Int32FieldAttribute>(false).FirstOrDefault();
document.Add(new Int32Field(prop.Name, (int?)val ?? 0, stored: GetStoreFromAttribute(intFieldAttribute)));
field = new Int32Field(prop.Name, (int?)val ?? 0, stored: GetStoreFromAttribute(intFieldAttribute));
}
else if (Attribute.IsDefined(prop, typeof(Int64FieldAttribute)))
{
var intFieldAttribute = prop.GetCustomAttributes<Int64FieldAttribute>(false).FirstOrDefault();
document.Add(new Int64Field(prop.Name, (long?)val ?? 0, GetStoreFromAttribute(intFieldAttribute)));
field = new Int64Field(prop.Name, (long?)val ?? 0, GetStoreFromAttribute(intFieldAttribute));
}
else if (Attribute.IsDefined(prop, typeof(SingleFieldAttribute)))
{
var intFieldAttribute = prop.GetCustomAttributes<SingleFieldAttribute>(false).FirstOrDefault();
document.Add(new SingleField(prop.Name, (float?)val ?? 0, GetStoreFromAttribute(intFieldAttribute)));
field = new SingleField(prop.Name, (float?)val ?? 0, GetStoreFromAttribute(intFieldAttribute));
}
else if (Attribute.IsDefined(prop, typeof(DoubleFieldAttribute)))
{
var intFieldAttribute = prop.GetCustomAttributes<DoubleFieldAttribute>(false).FirstOrDefault();
document.Add(new DoubleField(prop.Name, (double?)val ?? 0, GetStoreFromAttribute(intFieldAttribute)));
field = new DoubleField(prop.Name, (double?)val ?? 0, GetStoreFromAttribute(intFieldAttribute));
}
}

if (field != null)
{
luceneIndex.LuceneIndexingStrategy.OnDocumentAddField(document, field);
}
}
catch (Exception ex)
{
Expand Down
Loading