Skip to content

Commit

Permalink
Merge pull request #15 from Kentico/TK/lucene_decay
Browse files Browse the repository at this point in the history
lucene decay
  • Loading branch information
seangwright authored Sep 7, 2023
2 parents df6c62f + f5f0bfa commit 3dfffbb
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 14 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,17 @@ dotnet add package Kentico.Xperience.Lucene
- Read the Lucene.NET [introduction](https://lucenenet.apache.org/) or [full documentation](https://lucenenet.apache.org/docs/4.8.0-beta00016/) to explore the core library's APIs and functionality.
- Explore the [Lucene.NET source on GitHub](https://github.com/apache/lucenenet)
### Implementing document decay feature (scoring by "freshness", "recency")

1) boosting relevant fields by setting the field boost (the preferable method, but it requires more work)
2) boosting one field with a constant value that is always present in the search query (shown in the sample; a less desirable method. The downside is that all documents get matched, so it is usable only in scenarios where the total number of results is not required)
3) using a sort expression; implementation details can be found in the Lucene.NET unit tests and Lucene.NET implementations

Methods 1 and 2 require inheriting from `DefaultLuceneIndexingStrategy` and overriding the `OnDocumentAddField` method.
In `OnDocumentAddField`, match the required fields and calculate the boost, then apply it to the desired fields as shown in the example `DancingGoatLuceneIndexingStrategy.OnDocumentAddField`.

> Boost differences that are too small will be ignored by Lucene.

## Sample features

### Trigger rebuild of index via webhook
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using Kentico.Xperience.Lucene.Models;
using Kentico.Xperience.Lucene.Services.Implementations;
using Lucene.Net.Documents;
using Lucene.Net.Index;

namespace DancingGoat.Search;

Expand Down Expand Up @@ -46,6 +47,9 @@ public class DancingGoatLuceneIndexingStrategy : DefaultLuceneIndexingStrategy
nameof(Article.ArticleText),
};

// Pick a meaningful start date here. If the range of dates is too broad (for example, historic dates), change the decay algorithm instead.
private static readonly DateTime decayStartDate = new(2020, 1, 1, 0, 0, 0, DateTimeKind.Unspecified);

public override Task<object> OnIndexingProperty(TreeNode node, string propertyName, string usedColumn, object foundValue)
{
object result = foundValue;
Expand Down Expand Up @@ -91,4 +95,31 @@ private static string GetMediaURL(Article article)
? url[1..].ToString()
: url.ToString();
}

/// <summary>
/// Adds an extra boosted "$decay" text field to the document whenever the published-date field
/// is being added, then hands the incoming field to the base implementation.
/// </summary>
/// <param name="document">Document that is currently being built.</param>
/// <param name="field">Field that is about to be added to <paramref name="document"/>.</param>
public override void OnDocumentAddField(Document document, IIndexableField field)
{
    bool isPublishedDate = field.Name == nameof(DancingGoatSearchModel.PublishedDateTicks);

    if (isPublishedDate && field.GetInt64Value() is long publishedUnixMs)
    {
        long publishedTicks = DateTools.UnixTimeMillisecondsToTicks(publishedUnixMs);
        var publishedOn = new DateTime(publishedTicks, DateTimeKind.Unspecified);

        // Baseline is 1 January of the decay start year (the first meaningful date in the searched data).
        var baseline = new DateTime(decayStartDate.Year, 1, 1, 0, 0, 0, DateTimeKind.Unspecified);
        TimeSpan sinceBaseline = publishedOn - baseline;

        // Decay is expressed as the number of 365-day years elapsed since the baseline.
        // NOTE(review): dates earlier than the baseline produce a negative boost - confirm Lucene treats that as intended.
        float yearsSinceBaseline = (float)sinceBaseline.TotalDays / 365f;

        // Boost within a particular year by term occurrence: one "q" term per month (January => 1, December => 12).
        string monthTerms = string.Join(" ", Enumerable.Repeat('q', publishedOn.Month));

        // To avoid showing irrelevant search results, the boost can instead be applied to existing fields
        // (that way no additional field is needed in the index or in the query).
        var decayField = new TextField("$decay", monthTerms, Field.Store.NO);
        decayField.Boost = yearsSinceBaseline;
        document.Add(decayField);
    }

    base.OnDocumentAddField(document, field);
}
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
using System.Diagnostics;
using Kentico.Xperience.Lucene;
using Kentico.Xperience.Lucene.Models;
using Kentico.Xperience.Lucene.Services;
Expand Down Expand Up @@ -27,19 +28,30 @@ public LuceneSearchResultModel<DancingGoatSearchModel> Search(string searchText,
var queryBuilder = new QueryBuilder(index.Analyzer);

var query = string.IsNullOrWhiteSpace(searchText)
? new MatchAllDocsQuery()
? GetDefaultQuery(queryBuilder)
: GetTermQuery(queryBuilder, searchText);

var result = luceneIndexService.UseSearcher(
index,
(searcher) =>
{
var topDocs = searcher.Search(query, MAX_RESULTS,
new Sort(new SortField(
nameof(DancingGoatSearchModel.PublishedDateTicks),
FieldCache.NUMERIC_UTILS_INT64_PARSER,
true)));
var topDocs = searcher.Search(query, MAX_RESULTS
// uncomment if sort by score is not desirable
// ,
// new Sort(new SortField(
// nameof(DancingGoatSearchModel.PublishedDateTicks),
// FieldCache.NUMERIC_UTILS_INT64_PARSER,
// true))
);
foreach (var scoreDoc in topDocs.ScoreDocs
.Skip(offset)
.Take(limit))
{
var explanation = searcher.Explain(query, scoreDoc.Doc);
Trace.WriteLine(explanation);
}
return new LuceneSearchResultModel<DancingGoatSearchModel>()
{
Query = searchText ?? "",
Expand Down Expand Up @@ -73,6 +85,18 @@ private static Query GetDateRangeQuery()
true);
}

/// <summary>
/// Builds the query used when no search text is supplied: a single optional clause on the "$decay" field.
/// </summary>
/// <param name="queryBuilder">Builder used to create the "$decay" clause.</param>
/// <returns>A boolean query containing only the decay clause.</returns>
private static Query GetDefaultQuery(QueryBuilder queryBuilder)
{
    // The decay clause must be part of every query whose scoring should take decay into account.
    var decayClause = queryBuilder.CreateBooleanQuery("$decay", "q", Occur.SHOULD);
    decayClause.Boost = 0.01f;

    var defaultQuery = new BooleanQuery();
    defaultQuery.Add(decayClause, Occur.SHOULD);
    return defaultQuery;
}

private static Query GetTermQuery(QueryBuilder queryBuilder, string searchText)
{
var titlePhrase = queryBuilder.CreatePhraseQuery(nameof(DancingGoatSearchModel.Title), searchText, PHRASE_SLOP);
Expand All @@ -85,9 +109,14 @@ private static Query GetTermQuery(QueryBuilder queryBuilder, string searchText)
titleShould.Boost = 0.5f;
var contentShould = queryBuilder.CreateBooleanQuery(nameof(DancingGoatSearchModel.AllContent), searchText, Occur.SHOULD);
contentShould.Boost = 0.1f;

// decay query, SHALL BE defined in queries where we require scoring by decay
var decay = queryBuilder.CreateBooleanQuery("$decay", "q", Occur.SHOULD);
decay.Boost = 0.01f;

return new BooleanQuery
{
{ decay, Occur.SHOULD },
{ titlePhrase, Occur.SHOULD },
{ summaryPhrase, Occur.SHOULD },
{ contentPhrase, Occur.SHOULD },
Expand Down
14 changes: 13 additions & 1 deletion src/Kentico.Xperience.Lucene/Services/ILuceneIndexingStrategy.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
using CMS.DocumentEngine;
using Kentico.Xperience.Lucene.Attributes;
using Kentico.Xperience.Lucene.Models;
using Lucene.Net.Documents;
using Lucene.Net.Facet;
using Lucene.Net.Index;

namespace Kentico.Xperience.Lucene.Services;

Expand Down Expand Up @@ -38,6 +40,16 @@ public interface ILuceneIndexingStrategy
/// <returns>bool</returns>
bool ShouldIndexNode(TreeNode node);


/// <summary>
/// When overridden and a configuration is supplied, indexing will also create a taxonomy index for facet search.
/// </summary>
/// <returns>The facets configuration, or <c>null</c> when none is supplied.</returns>
FacetsConfig? FacetsConfigFactory();

/// <summary>
/// Called when a field is added to the document.
/// </summary>
/// <remarks>
/// NOTE(review): the indexing code appears to pass each created field here instead of adding it directly,
/// so an implementation is presumably expected to add <paramref name="field"/> to
/// <paramref name="document"/> itself (or call a base implementation that does) - confirm against the caller.
/// </remarks>
/// <param name="document">The document being indexed.</param>
/// <param name="field">The field being added to the document.</param>
void OnDocumentAddField(Document document, IIndexableField field);
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
using CMS.DocumentEngine;
using Kentico.Xperience.Lucene.Models;
using Lucene.Net.Documents;
using Lucene.Net.Facet;
using Lucene.Net.Index;

namespace Kentico.Xperience.Lucene.Services.Implementations;

Expand All @@ -20,4 +22,6 @@ public class DefaultLuceneIndexingStrategy : ILuceneIndexingStrategy

/// <inheritdoc />
public virtual FacetsConfig? FacetsConfigFactory()
{
    // No facets configuration is supplied by default.
    return null;
}

/// <inheritdoc />
public virtual void OnDocumentAddField(Document document, IIndexableField field)
{
    // Default behaviour: add the field to the document unchanged.
    document.Add(field);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using Kentico.Xperience.Lucene.Models;
using Lucene.Net.Documents;
using Lucene.Net.Facet;
using Lucene.Net.Index;
using Newtonsoft.Json;

namespace Kentico.Xperience.Lucene.Services.Implementations;
Expand Down Expand Up @@ -38,39 +39,45 @@ protected void MapModelProps(LuceneIndex luceneIndex, LuceneSearchModel model, D
{
// use reflection to get attributes only once in static code
object? val = prop.GetValue(model);
IIndexableField? field = null;
if (val != null)
{
if (Attribute.IsDefined(prop, typeof(TextFieldAttribute)))
{
var textFieldAttribute = prop.GetCustomAttributes<TextFieldAttribute>(false).FirstOrDefault();
document.Add(new TextField(prop.Name, val?.ToString(), GetStoreFromAttribute(textFieldAttribute)));
field = new TextField(prop.Name, val?.ToString(), GetStoreFromAttribute(textFieldAttribute));
}
if (Attribute.IsDefined(prop, typeof(StringFieldAttribute)))
{
var stringFieldAttribute = prop.GetCustomAttributes<StringFieldAttribute>(false).FirstOrDefault();
document.Add(new StringField(prop.Name, val?.ToString(), GetStoreFromAttribute(stringFieldAttribute)));
field = new StringField(prop.Name, val?.ToString(), GetStoreFromAttribute(stringFieldAttribute));
}
else if (Attribute.IsDefined(prop, typeof(Int32FieldAttribute)))
{
var intFieldAttribute = prop.GetCustomAttributes<Int32FieldAttribute>(false).FirstOrDefault();
document.Add(new Int32Field(prop.Name, (int?)val ?? 0, stored: GetStoreFromAttribute(intFieldAttribute)));
field = new Int32Field(prop.Name, (int?)val ?? 0, stored: GetStoreFromAttribute(intFieldAttribute));
}
else if (Attribute.IsDefined(prop, typeof(Int64FieldAttribute)))
{
var intFieldAttribute = prop.GetCustomAttributes<Int64FieldAttribute>(false).FirstOrDefault();
document.Add(new Int64Field(prop.Name, (long?)val ?? 0, GetStoreFromAttribute(intFieldAttribute)));
field = new Int64Field(prop.Name, (long?)val ?? 0, GetStoreFromAttribute(intFieldAttribute));
}
else if (Attribute.IsDefined(prop, typeof(SingleFieldAttribute)))
{
var intFieldAttribute = prop.GetCustomAttributes<SingleFieldAttribute>(false).FirstOrDefault();
document.Add(new SingleField(prop.Name, (float?)val ?? 0, GetStoreFromAttribute(intFieldAttribute)));
field = new SingleField(prop.Name, (float?)val ?? 0, GetStoreFromAttribute(intFieldAttribute));
}
else if (Attribute.IsDefined(prop, typeof(DoubleFieldAttribute)))
{
var intFieldAttribute = prop.GetCustomAttributes<DoubleFieldAttribute>(false).FirstOrDefault();
document.Add(new DoubleField(prop.Name, (double?)val ?? 0, GetStoreFromAttribute(intFieldAttribute)));
field = new DoubleField(prop.Name, (double?)val ?? 0, GetStoreFromAttribute(intFieldAttribute));
}
}

if (field != null)
{
luceneIndex.LuceneIndexingStrategy.OnDocumentAddField(document, field);
}
}
catch (Exception ex)
{
Expand Down

0 comments on commit 3dfffbb

Please sign in to comment.