Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#178 - week-number based date extraction patterns for titles #190

Merged
merged 6 commits into from
Jan 14, 2025
96 changes: 89 additions & 7 deletions src/custom-sort/matchers.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import {
getDateForWeekOfYear
} from "../utils/week-of-year";

export const RomanNumberRegexStr: string = ' *([MDCLXVI]+)'; // Roman number
export const CompoundRomanNumberDotRegexStr: string = ' *([MDCLXVI]+(?:\\.[MDCLXVI]+)*)';// Compound Roman number with dot as separator
export const CompoundRomanNumberDashRegexStr: string = ' *([MDCLXVI]+(?:-[MDCLXVI]+)*)'; // Compound Roman number with dash as separator
Expand All @@ -6,15 +10,26 @@ export const NumberRegexStr: string = ' *(\\d+)'; // Plain number
export const CompoundNumberDotRegexStr: string = ' *(\\d+(?:\\.\\d+)*)'; // Compound number with dot as separator
export const CompoundNumberDashRegexStr: string = ' *(\\d+(?:-\\d+)*)'; // Compound number with dash as separator

// Date like 2020-01-31: optional leading spaces, then one capturing group with a 4-digit year
// followed by two dash-separated numeric parts. The parts are matched loosely (`[0-3]*[0-9]` accepts
// one or more digits), so the pattern does not validate the month or day range.
export const Date_yyyy_mm_dd_RegexStr: string = ' *(\\d{4}-[0-3]*[0-9]-[0-3]*[0-9])'
// Same textual shape as yyyy-mm-dd, hence the same pattern; which part is the day and which is the month
// is decided by the normalizer function, not by the regex.
export const Date_yyyy_dd_mm_RegexStr: string = Date_yyyy_mm_dd_RegexStr

export const Date_dd_Mmm_yyyy_RegexStr: string = ' *([0-3]*[0-9]-(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\\d{4})'; // Date like 01-Jan-2020
export const Date_Mmm_dd_yyyy_RegexStr: string = ' *((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-[0-3]*[0-9]-\\d{4})'; // Date like Jan-01-2020

export const DOT_SEPARATOR = '.'
// Week-based date followed by a parenthesized month-day, like 2025-W03 (01-13):
// optional leading spaces, then one capturing group with the whole expression.
export const Date_yyyy_Www_mm_dd_RegexStr: string = ' *(\\d{4}-W[0-5]*[0-9] \\([0-3]*[0-9]-[0-3]*[0-9]\\))'
// Week-based date like 2025-W03, optionally followed by a single '-' or '+' modifier
// (e.g. 2025-W03- or 2025-W03+); the modifier is part of the capturing group.
export const Date_yyyy_WwwISO_RegexStr: string = ' *(\\d{4}-W[0-5]*[0-9][-+]?)'
// The non-ISO week variant has the same textual shape, so it reuses the ISO pattern;
// the two variants use different normalizer functions.
export const Date_yyyy_Www_RegexStr: string = Date_yyyy_WwwISO_RegexStr

export const DOT_SEPARATOR = '.' // ASCII 46
export const DASH_SEPARATOR = '-'

const SLASH_SEPARATOR = '/' // ASCII 47
const SLASH_SEPARATOR = '/' // ASCII 47, right before ASCII 48 = '0'
const GT_SEPARATOR = '>' // ASCII 62, alphabetical sorting in Collator puts it after /
const PIPE_SEPARATOR = '|' // ASCII 124

const EARLIER_THAN_SLASH_SEPARATOR = DOT_SEPARATOR
const LATER_THAN_SLASH_SEPARATOR = GT_SEPARATOR

export const DEFAULT_NORMALIZATION_PLACES = 8; // Fixed width of a normalized number (with leading zeros)

// Property escapes:
Expand Down Expand Up @@ -51,9 +66,9 @@ export function getNormalizedNumber(s: string = '', separator?: string, places?:
// guarantees correct order (/ = ASCII 47, | = ASCII 124)
if (separator) {
const components: Array<string> = s.split(separator).filter(s => s)
return `${components.map((c) => prependWithZeros(c, places ?? DEFAULT_NORMALIZATION_PLACES)).join(PIPE_SEPARATOR)}//`
return `${components.map((c) => prependWithZeros(c, places ?? DEFAULT_NORMALIZATION_PLACES)).join(PIPE_SEPARATOR)}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
} else {
return `${prependWithZeros(s, places ?? DEFAULT_NORMALIZATION_PLACES)}//`
return `${prependWithZeros(s, places ?? DEFAULT_NORMALIZATION_PLACES)}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
}
}

Expand Down Expand Up @@ -97,9 +112,9 @@ export function getNormalizedRomanNumber(s: string, separator?: string, places?:
// guarantees correct order (/ = ASCII 47, | = ASCII 124)
if (separator) {
const components: Array<string> = s.split(separator).filter(s => s)
return `${components.map((c) => prependWithZeros(romanToIntStr(c), places ?? DEFAULT_NORMALIZATION_PLACES)).join(PIPE_SEPARATOR)}//`
return `${components.map((c) => prependWithZeros(romanToIntStr(c), places ?? DEFAULT_NORMALIZATION_PLACES)).join(PIPE_SEPARATOR)}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
} else {
return `${prependWithZeros(romanToIntStr(s), places ?? DEFAULT_NORMALIZATION_PLACES)}//`
return `${prependWithZeros(romanToIntStr(s), places ?? DEFAULT_NORMALIZATION_PLACES)}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
}
}

Expand All @@ -117,9 +132,76 @@ export function getNormalizedDate_NormalizerFn_for(separator: string, dayIdx: nu
const monthValue = months ? `${1 + MONTHS.indexOf(components[monthIdx])}` : components[monthIdx]
const month = prependWithZeros(monthValue, MONTH_POSITIONS)
const year = prependWithZeros(components[yearIdx], YEAR_POSITIONS)
return `${year}-${month}-${day}//`
return `${year}-${month}-${day}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
}
}

export const getNormalizedDate_yyyy_mm_dd_NormalizerFn = getNormalizedDate_NormalizerFn_for('-', 2, 1, 0)
export const getNormalizedDate_yyyy_dd_mm_NormalizerFn = getNormalizedDate_NormalizerFn_for('-', 1, 2, 0)
export const getNormalizedDate_dd_Mmm_yyyy_NormalizerFn = getNormalizedDate_NormalizerFn_for('-', 0, 1, 2, MONTHS)
export const getNormalizedDate_Mmm_dd_yyyy_NormalizerFn = getNormalizedDate_NormalizerFn_for('-', 1, 0, 2, MONTHS)

// Optional one-character suffix of a yyyy-Www expression which shifts the relative order
// of the item against other items resolving to the same date.
const DateExtractor_orderModifier_earlier_than = '-'
const DateExtractor_orderModifier_later_than = '+'

// Extractors splitting an already matched date expression into its components.
// Month and day accept one or two digits, so that every expression accepted by the (looser)
// title-matching pattern for `yyyy-Www (mm-dd)` - e.g. `2025-W1 (1-5)` - can also be decomposed here.
const DateExtractor_yyyy_Www_mm_dd_Regex = /(\d{4})-W(\d{1,2}) \((\d{1,2})-(\d{1,2})\)/
const DateExtractor_yyyy_Www_Regex = /(\d{4})-W(\d{1,2})([-+]?)/

// Matching groups
const YEAR_IDX = 1
const WEEK_IDX = 2
const MONTH_IDX = 3
const DAY_IDX = 4
const RELATIVE_ORDER_IDX = 3 // For the yyyy-Www only: yyyy-Www- or yyyy-Www+

// 1-based month numbers, used when detecting weeks which spill over a year boundary
const DECEMBER = 12
const JANUARY = 1

/**
 * Creates a normalizer for week-based date expressions found in titles.
 *
 * The returned function turns a matched expression into a sortable `yyyy-mm-dd` string followed by
 * a two-character suffix. The first suffix character is normally '/', and is replaced by a character
 * sorting before or after '/' when the expression carries the '-' or '+' order modifier.
 *
 * @param consumeWeek when true, the expression is `yyyy-Www` (optionally followed by '-' or '+') and
 *        the month and day are derived from the week number; when false, the expression is
 *        `yyyy-Www (mm-dd)`, the week number is ignored and the month and day are taken from the parentheses
 * @param weeksISO forwarded unchanged to `getDateForWeekOfYear` (presumably selects ISO week numbering -
 *        confirm against that helper); only relevant when `consumeWeek` is true
 * @returns a function mapping the matched text to its normalized form, or to `null` when the text
 *          cannot be decomposed into date components
 */
export function getNormalizedDate_NormalizerFn_yyyy_Www_mm_dd(consumeWeek: boolean, weeksISO?: boolean) {
    return (s: string): string | null => {
        const matches = consumeWeek ? DateExtractor_yyyy_Www_Regex.exec(s) : DateExtractor_yyyy_Www_mm_dd_Regex.exec(s)
        // The extractor regexes are stricter than the title-matching patterns in some details,
        // so a missing match is reported as null instead of failing on a null dereference.
        if (!matches) {
            return null
        }
        let yearNumber = Number.parseInt(matches[YEAR_IDX], 10)
        let monthNumber: number
        let dayNumber: number
        let separator: string = SLASH_SEPARATOR // different values enforce relative > < order of same dates
        let useLastDayOfWeek = false
        if (consumeWeek) {
            const weekNumber = Number.parseInt(matches[WEEK_IDX], 10)
            const orderModifier: string | undefined = matches[RELATIVE_ORDER_IDX]
            if (orderModifier === DateExtractor_orderModifier_earlier_than) {
                separator = EARLIER_THAN_SLASH_SEPARATOR
            } else if (orderModifier === DateExtractor_orderModifier_later_than) {
                separator = LATER_THAN_SLASH_SEPARATOR // Will also need to adjust the date to the last day of the week
                useLastDayOfWeek = true
            }
            const dateForWeek = getDateForWeekOfYear(yearNumber, weekNumber, weeksISO, useLastDayOfWeek)
            monthNumber = dateForWeek.getMonth() + 1 // 1 - 12
            dayNumber = dateForWeek.getDate() // 1 - 31
            // Be careful with edge dates, which can belong to previous or next year:
            // week 1 resolving to a December date belongs to the previous year ...
            if (weekNumber === 1) {
                if (monthNumber === DECEMBER) {
                    yearNumber--
                }
            }
            // ... and one of the last weeks resolving to a January date belongs to the next year
            if (weekNumber >= 50) {
                if (monthNumber === JANUARY) {
                    yearNumber++
                }
            }
        } else { // ignore week
            monthNumber = Number.parseInt(matches[MONTH_IDX], 10)
            dayNumber = Number.parseInt(matches[DAY_IDX], 10)
        }
        return `${prependWithZeros(`${yearNumber}`, YEAR_POSITIONS)}` +
            `-${prependWithZeros(`${monthNumber}`, MONTH_POSITIONS)}` +
            `-${prependWithZeros(`${dayNumber}`, DAY_POSITIONS)}` +
            `${separator}${SLASH_SEPARATOR}`
    }
}

// Ready-to-use normalizers for the week-based date expressions:
// - `yyyy-Www (mm-dd)`: the week number is ignored, month and day come from the parenthesized part
export const getNormalizedDate_yyyy_Www_mm_dd_NormalizerFn = getNormalizedDate_NormalizerFn_yyyy_Www_mm_dd(false)
// - `yyyy-Www` with the date derived from the week number, weeksISO flag set
export const getNormalizedDate_yyyy_WwwISO_NormalizerFn = getNormalizedDate_NormalizerFn_yyyy_Www_mm_dd(true, true)
// - `yyyy-Www` with the date derived from the week number, weeksISO flag cleared
export const getNormalizedDate_yyyy_Www_NormalizerFn = getNormalizedDate_NormalizerFn_yyyy_Www_mm_dd(true, false)
64 changes: 58 additions & 6 deletions src/custom-sort/sorting-spec-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,19 @@ import {
DASH_SEPARATOR,
Date_dd_Mmm_yyyy_RegexStr,
Date_Mmm_dd_yyyy_RegexStr,
Date_yyyy_dd_mm_RegexStr,
Date_yyyy_mm_dd_RegexStr,
Date_yyyy_Www_mm_dd_RegexStr,
Date_yyyy_Www_RegexStr,
Date_yyyy_WwwISO_RegexStr,
DOT_SEPARATOR,
getNormalizedDate_dd_Mmm_yyyy_NormalizerFn,
getNormalizedDate_Mmm_dd_yyyy_NormalizerFn,
getNormalizedDate_yyyy_dd_mm_NormalizerFn,
getNormalizedDate_yyyy_mm_dd_NormalizerFn,
getNormalizedDate_yyyy_Www_mm_dd_NormalizerFn,
getNormalizedDate_yyyy_Www_NormalizerFn,
getNormalizedDate_yyyy_WwwISO_NormalizerFn,
getNormalizedNumber,
getNormalizedRomanNumber,
NumberRegexStr,
Expand All @@ -36,10 +46,7 @@ import {
MATCH_CHILDREN_2_SUFFIX,
NO_PRIORITY
} from "./folder-matching-rules"
import {
MDataExtractor,
tryParseAsMDataExtractorSpec
} from "./mdata-extractors";
import {MDataExtractor, tryParseAsMDataExtractorSpec} from "./mdata-extractors";

interface ProcessingContext {
folderPath: string
Expand Down Expand Up @@ -352,8 +359,13 @@ const InlineRegexSymbol_Digit1: string = '\\d'
const InlineRegexSymbol_Digit2: string = '\\[0-9]'
const InlineRegexSymbol_0_to_3: string = '\\[0-3]'

const Date_yyyy_mm_dd_RegexSymbol: string = '\\[yyyy-mm-dd]'
const Date_yyyy_dd_mm_RegexSymbol: string = '\\[yyyy-dd-mm]'
const Date_dd_Mmm_yyyy_RegexSymbol: string = '\\[dd-Mmm-yyyy]'
const Date_Mmm_dd_yyyy_RegexSymbol: string = '\\[Mmm-dd-yyyy]'
const Date_yyyy_Www_mm_dd_RegexSymbol: string = '\\[yyyy-Www (mm-dd)]'
const Date_yyyy_Www_RegexSymbol: string = '\\[yyyy-Www]'
const Date_yyyy_WwwISO_RegexSymbol: string = '\\[yyyy-WwwISO]'

const InlineRegexSymbol_CapitalLetter: string = '\\C'
const InlineRegexSymbol_LowercaseLetter: string = '\\l'
Expand All @@ -373,8 +385,13 @@ const sortingSymbolsArr: Array<string> = [
escapeRegexUnsafeCharacters(CompoundRomanNumberDashRegexSymbol),
escapeRegexUnsafeCharacters(WordInASCIIRegexSymbol),
escapeRegexUnsafeCharacters(WordInAnyLanguageRegexSymbol),
escapeRegexUnsafeCharacters(Date_yyyy_mm_dd_RegexSymbol),
escapeRegexUnsafeCharacters(Date_yyyy_dd_mm_RegexSymbol),
escapeRegexUnsafeCharacters(Date_dd_Mmm_yyyy_RegexSymbol),
escapeRegexUnsafeCharacters(Date_Mmm_dd_yyyy_RegexSymbol)
escapeRegexUnsafeCharacters(Date_Mmm_dd_yyyy_RegexSymbol),
escapeRegexUnsafeCharacters(Date_yyyy_Www_mm_dd_RegexSymbol),
escapeRegexUnsafeCharacters(Date_yyyy_WwwISO_RegexSymbol),
escapeRegexUnsafeCharacters(Date_yyyy_Www_RegexSymbol),
]

const sortingSymbolsRegex = new RegExp(sortingSymbolsArr.join('|'), 'gi')
Expand Down Expand Up @@ -442,8 +459,13 @@ export const CompoundDashRomanNumberNormalizerFn: NormalizerFn = (s: string) =>
// Adapters exposing the normalizers imported from matchers under the NormalizerFn type.
// Plain and compound numbers; the compound variants pass the separator used to split the components.
export const NumberNormalizerFn: NormalizerFn = (s: string) => getNormalizedNumber(s)
export const CompoundDotNumberNormalizerFn: NormalizerFn = (s: string) => getNormalizedNumber(s, DOT_SEPARATOR)
export const CompoundDashNumberNormalizerFn: NormalizerFn = (s: string) => getNormalizedNumber(s, DASH_SEPARATOR)
// Calendar dates in the supported notations
export const Date_yyyy_mm_dd_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_mm_dd_NormalizerFn(s)
export const Date_yyyy_dd_mm_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_dd_mm_NormalizerFn(s)
export const Date_dd_Mmm_yyyy_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_dd_Mmm_yyyy_NormalizerFn(s)
export const Date_Mmm_dd_yyyy_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_Mmm_dd_yyyy_NormalizerFn(s)
// Week-number based dates
export const Date_yyyy_Www_mm_dd_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_Www_mm_dd_NormalizerFn(s)
export const Date_yyyy_WwwISO_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_WwwISO_NormalizerFn(s)
export const Date_yyyy_Www_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_Www_NormalizerFn(s)

export enum AdvancedRegexType {
None, // to allow if (advancedRegex)
Expand All @@ -455,8 +477,13 @@ export enum AdvancedRegexType {
CompoundDashRomanNumber,
WordInASCII,
WordInAnyLanguage,
Date_yyyy_mm_dd,
Date_yyyy_dd_mm,
Date_dd_Mmm_yyyy,
Date_Mmm_dd_yyyy
Date_Mmm_dd_yyyy,
Date_yyyy_Www_mm_dd_yyyy,
Date_yyyy_WwwISO,
Date_yyyy_Www
}

const sortingSymbolToRegexpStr: { [key: string]: RegExpSpecStr } = {
Expand Down Expand Up @@ -501,6 +528,16 @@ const sortingSymbolToRegexpStr: { [key: string]: RegExpSpecStr } = {
advancedRegexType: AdvancedRegexType.WordInAnyLanguage,
unicodeRegex: true
},
[Date_yyyy_mm_dd_RegexSymbol]: { // Intentionally retain character case
regexpStr: Date_yyyy_mm_dd_RegexStr,
normalizerFn: Date_yyyy_mm_dd_NormalizerFn,
advancedRegexType: AdvancedRegexType.Date_yyyy_mm_dd
},
[Date_yyyy_dd_mm_RegexSymbol]: { // Intentionally retain character case
regexpStr: Date_yyyy_dd_mm_RegexStr,
normalizerFn: Date_yyyy_dd_mm_NormalizerFn,
advancedRegexType: AdvancedRegexType.Date_yyyy_dd_mm
},
[Date_dd_Mmm_yyyy_RegexSymbol]: { // Intentionally retain character case
regexpStr: Date_dd_Mmm_yyyy_RegexStr,
normalizerFn: Date_dd_Mmm_yyyy_NormalizerFn,
Expand All @@ -510,6 +547,21 @@ const sortingSymbolToRegexpStr: { [key: string]: RegExpSpecStr } = {
regexpStr: Date_Mmm_dd_yyyy_RegexStr,
normalizerFn: Date_Mmm_dd_yyyy_NormalizerFn,
advancedRegexType: AdvancedRegexType.Date_Mmm_dd_yyyy
},
[Date_yyyy_Www_mm_dd_RegexSymbol]: { // Intentionally retain character case
regexpStr: Date_yyyy_Www_mm_dd_RegexStr,
normalizerFn: Date_yyyy_Www_mm_dd_NormalizerFn,
advancedRegexType: AdvancedRegexType.Date_yyyy_Www_mm_dd_yyyy
},
[Date_yyyy_WwwISO_RegexSymbol]: { // Intentionally retain character case
regexpStr: Date_yyyy_WwwISO_RegexStr,
normalizerFn: Date_yyyy_WwwISO_NormalizerFn,
advancedRegexType: AdvancedRegexType.Date_yyyy_WwwISO
},
[Date_yyyy_Www_RegexSymbol]: { // Intentionally retain character case
regexpStr: Date_yyyy_Www_RegexStr,
normalizerFn: Date_yyyy_Www_NormalizerFn,
advancedRegexType: AdvancedRegexType.Date_yyyy_Www
}
}

Expand Down
Loading
Loading