Skip to content

Commit

Permalink
Add multi-dims to data catalog
Browse files Browse the repository at this point in the history
  • Loading branch information
rakyi committed Mar 7, 2025
1 parent 9960a4c commit e898c1d
Show file tree
Hide file tree
Showing 14 changed files with 230 additions and 42 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -293,8 +293,8 @@ reindex: itsJustJavascript
node --enable-source-maps itsJustJavascript/baker/algolia/indexChartsToAlgolia.js
@echo '--- Running indexExplorerViewsToAlgolia...'
node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsToAlgolia.js
@echo '--- Running indexExplorerViewsAndChartsToAlgolia...'
node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsAndChartsToAlgolia.js
@echo '--- Running indexExplorerViewsMdimViewsAndChartsToAlgolia...'
node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsMdimViewsAndChartsToAlgolia.js

delete-algolia-index: itsJustJavascript
@echo '==> Deleting Algolia index'
Expand Down
15 changes: 3 additions & 12 deletions adminSiteServer/multiDim.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import {
MultiDimDataPageConfigPreProcessed,
MultiDimDataPageConfigRaw,
MultiDimDataPagesTableName,
MultiDimDimensionChoices,
MultiDimXChartConfigsTableName,
parseChartConfigsRow,
R2GrapherConfigDirectory,
Expand All @@ -28,7 +27,7 @@ import {
import {
mergeGrapherConfigs,
MultiDimDataPageConfig,
slugify,
multiDimDimensionsToViewId,
} from "@ourworldindata/utils"
import * as db from "../db/db.js"
import { upsertMultiDimDataPage } from "../db/model/MultiDimDataPage.js"
Expand All @@ -47,14 +46,6 @@ import {
updateChartConfigInDbAndR2,
} from "./chartConfigHelpers.js"

/**
 * Derives a view id from a set of dimension choices: the chosen values are
 * taken in key-sorted order, slugified, joined with "__" and lower-cased.
 */
function dimensionsToViewId(dimensions: MultiDimDimensionChoices) {
    const entriesSortedByKey = Object.entries(dimensions).sort((a, b) =>
        a[0].localeCompare(b[0])
    )
    const slugifiedChoices = entriesSortedByKey.map((entry) =>
        slugify(entry[1])
    )
    return slugifiedChoices.join("__").toLowerCase()
}

function catalogPathFromIndicatorEntry(
entry: IndicatorEntryBeforePreProcessing
): string | undefined {
Expand Down Expand Up @@ -289,7 +280,7 @@ export async function upsertMultiDim(
patchGrapherConfig
)
const existingChartConfigId = existingViewIdsToChartConfigIds.get(
dimensionsToViewId(view.dimensions)
multiDimDimensionsToViewId(view.dimensions)
)
let chartConfigId
if (existingChartConfigId) {
Expand Down Expand Up @@ -330,7 +321,7 @@ export async function upsertMultiDim(
for (const view of enrichedConfig.views) {
await upsertMultiDimXChartConfigs(knex, {
multiDimId,
viewId: dimensionsToViewId(view.dimensions),
viewId: multiDimDimensionsToViewId(view.dimensions),
variableId: view.indicators.y[0].id,
chartConfigId: view.fullConfigId,
})
Expand Down
12 changes: 8 additions & 4 deletions baker/MultiDimBaker.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,14 @@ import {
import { logErrorAndMaybeCaptureInSentry } from "../serverUtils/errorLog.js"
import { getAllPublishedChartSlugs } from "../db/model/Chart.js"
import {
getAllPublishedMultiDimDataPages,
getAllPublishedMultiDimDataPagesBySlug,
getMultiDimDataPageByCatalogPath,
getMultiDimDataPageBySlug,
} from "../db/model/MultiDimDataPage.js"

const getRelevantVariableIds = (config: MultiDimDataPageConfigPreProcessed) => {
export function getRelevantVariableIds(
config: MultiDimDataPageConfigPreProcessed
) {
// A "relevant" variable id is the first y indicator of each view
const allIndicatorIds = config.views
.map((view) => view.indicators.y?.[0]?.id)
Expand All @@ -49,7 +51,9 @@ const getRelevantVariableIds = (config: MultiDimDataPageConfigPreProcessed) => {
return new Set(allIndicatorIds)
}

async function getRelevantVariableMetadata(variableIds: Iterable<number>) {
export async function getRelevantVariableMetadata(
variableIds: Iterable<number>
) {
const metadata = await pMap(
variableIds,
async (id) => {
Expand Down Expand Up @@ -248,7 +252,7 @@ export const bakeAllMultiDimDataPages = async (
bakedSiteDir: string,
imageMetadata: Record<string, ImageMetadata>
) => {
const multiDimsBySlug = await getAllPublishedMultiDimDataPages(knex)
const multiDimsBySlug = await getAllPublishedMultiDimDataPagesBySlug(knex)
const progressBar = new ProgressBar(
"bake multi-dim page [:bar] :current/:total :elapseds :rate/s :name\n",
{
Expand Down
2 changes: 1 addition & 1 deletion baker/algolia/configureAlgolia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ export const configureAlgolia = async () => {
})

const explorerViewsAndChartsIndex = client.initIndex(
getIndexName(SearchIndexName.ExplorerViewsAndCharts)
getIndexName(SearchIndexName.ExplorerViewsMdimViewsAndCharts)
)

await explorerViewsAndChartsIndex.setSettings({
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,16 @@ import { scaleRecordScores } from "./utils/shared.js"
import { getChartsRecords } from "./utils/charts.js"
import { getIndexName } from "../../site/search/searchClient.js"
import { SearchIndexName } from "../../site/search/searchTypes.js"
import { getMdimViewRecords } from "./utils/mdimViews.js"

// We get 200k operations with Algolia's Open Source plan. We've hit 140k in the past so this might push us over.
// If we standardize the record shape, we could have this be the only index and have a `type` field
// to use in /search.
const indexExplorerViewsAndChartsToAlgolia = async () => {
const indexExplorerViewsMdimViewsAndChartsToAlgolia = async () => {
if (!ALGOLIA_INDEXING) return
const indexName = getIndexName(SearchIndexName.ExplorerViewsAndCharts)
const indexName = getIndexName(
SearchIndexName.ExplorerViewsMdimViewsAndCharts
)
console.log(
`Indexing explorer views and charts to the "${indexName}" index on Algolia`
)
Expand All @@ -31,33 +34,37 @@ const indexExplorerViewsAndChartsToAlgolia = async () => {
)
}

const { explorerViews, grapherViews } = await db.knexReadonlyTransaction(
async (trx) => {
const { explorerViews, mdimViews, grapherViews } =
await db.knexReadonlyTransaction(async (trx) => {
return {
explorerViews: await getExplorerViewRecords(trx, true),
mdimViews: await getMdimViewRecords(trx),
grapherViews: await getChartsRecords(trx),
}
},
db.TransactionCloseMode.Close
)
}, db.TransactionCloseMode.Close)

// Scale grapher records, multi-dim views, and the default explorer views between 1000 and 10000.
// Scale the remaining explorer views between 0 and 1000.
// This is because Graphers are generally higher quality than Explorers and we don't want
// the data catalog to smother Grapher results with hundreds of low-quality Explorer results.
const scaledGrapherViews = scaleRecordScores(grapherViews, [1000, 10000])
const scaledExplorerViews = adaptExplorerViews(explorerViews)
const scaledMdimViews = scaleRecordScores(mdimViews, [1000, 10000])

const records = [...scaledGrapherViews, ...scaledExplorerViews]
const records = [
...scaledGrapherViews,
...scaledExplorerViews,
...scaledMdimViews,
]

const index = client.initIndex(indexName)
console.log(`Indexing ${records.length} records`)
await index.replaceAllObjects(records)
console.log(`Indexing complete`)
}

indexExplorerViewsAndChartsToAlgolia().catch(async (e) => {
console.error("Error in indexExplorerViewsAndChartsToAlgolia:", e)
indexExplorerViewsMdimViewsAndChartsToAlgolia().catch(async (e) => {
console.error("Error in indexExplorerViewsMdimViewsAndChartsToAlgolia:", e)
Sentry.captureException(e)
await Sentry.close()
process.exit(1)
Expand Down
6 changes: 4 additions & 2 deletions baker/algolia/utils/charts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ import { getRelatedArticles } from "../../../db/model/Post.js"
import { getPublishedLinksTo } from "../../../db/model/Link.js"
import { isPathRedirectedToExplorer } from "../../../explorerAdminServer/ExplorerRedirects.js"
import { ParsedChartRecordRow, RawChartRecordRow } from "./types.js"
import { excludeNullish } from "@ourworldindata/utils"
import {
excludeNullish,
getUniqueNamesFromParentTagArrays,
} from "@ourworldindata/utils"
import { processAvailableEntities } from "./shared.js"
import { getUniqueNamesFromParentTagArrays } from "@ourworldindata/utils/dist/Util.js"

const computeChartScore = (record: Omit<ChartRecord, "score">): number => {
const { numRelatedArticles, views_7d } = record
Expand Down
3 changes: 1 addition & 2 deletions baker/algolia/utils/explorerViews.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import {

import * as db from "../../../db/db.js"
import { DATA_API_URL } from "../../../settings/serverSettings.js"
import { keyBy } from "@ourworldindata/utils"
import { getUniqueNamesFromParentTagArrays, keyBy } from "@ourworldindata/utils"
import { getAnalyticsPageviewsByUrlObj } from "../../../db/model/Pageview.js"
import {
CsvUnenrichedExplorerViewRecord,
Expand All @@ -49,7 +49,6 @@ import {
ChartRecord,
ChartRecordType,
} from "../../../site/search/searchTypes.js"
import { getUniqueNamesFromParentTagArrays } from "@ourworldindata/utils/dist/Util.js"

export function explorerViewRecordToChartRecord(
e: ExplorerViewFinalRecord
Expand Down
159 changes: 159 additions & 0 deletions baker/algolia/utils/mdimViews.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import {
ChartConfigsTableName,
DbEnrichedMultiDimDataPage,
DbPlainMultiDimXChartConfig,
DbRawChartConfig,
getUniqueNamesFromParentTagArrays,
merge,
multiDimDimensionsToViewId,
MultiDimXChartConfigsTableName,
parseChartConfig,
queryParamsToStr,
} from "@ourworldindata/utils"
import * as db from "../../../db/db.js"
import { getAllPublishedMultiDimDataPages } from "../../../db/model/MultiDimDataPage.js"
import { getAnalyticsPageviewsByUrlObj } from "../../../db/model/Pageview.js"
import { logErrorAndMaybeCaptureInSentry } from "../../../serverUtils/errorLog.js"
import { ChartRecordType } from "../../../site/search/searchTypes.js"
import {
getRelevantVariableIds,
getRelevantVariableMetadata,
} from "../../MultiDimBaker.js"

/**
 * Fetches the `full` config of every chart config whose id is in `ids` and
 * returns the parsed configs keyed by chart config id.
 */
async function getChartConfigsByIds(
    knex: db.KnexReadonlyTransaction,
    ids: string[]
) {
    const chartConfigRows = await knex<DbRawChartConfig>(ChartConfigsTableName)
        .select("id", "full")
        .whereIn("id", ids)
    const parsedEntries = chartConfigRows.map(
        ({ id, full }) => [id, parseChartConfig(full)] as const
    )
    return new Map(parsedEntries)
}

/**
 * Reads every row of the multi-dim x chart-config table and returns a map
 * from the composite key `${multiDimId}-${viewId}` to the row id.
 */
async function getMultiDimXChartConfigIdMap(trx: db.KnexReadonlyTransaction) {
    const linkRows = await trx<DbPlainMultiDimXChartConfig>(
        MultiDimXChartConfigsTableName
    ).select("id", "multiDimId", "viewId")
    const keyedEntries = linkRows.map((linkRow) => {
        const compositeKey: string = `${linkRow.multiDimId}-${linkRow.viewId}`
        return [compositeKey, linkRow.id] as const
    })
    return new Map(keyedEntries)
}

/**
 * Builds one search record per view of the given multi-dim data page.
 *
 * For every view, the metadata of its first y indicator is merged, in order,
 * with the multi-dim's config-level metadata and the view's own metadata; the
 * record's title, subtitle and available entities are derived from the result.
 *
 * @param trx - Read-only transaction used for the database lookups.
 * @param multiDim - The multi-dim data page whose views are turned into records.
 * @param tags - Tag names attached to every record of this multi-dim.
 * @param pageviews - Pageview stats keyed by URL path; looked up under
 *   `/grapher/${slug}`.
 * @returns One record per entry of `multiDim.config.views`.
 * @throws Error when a view has no matching MultiDimXChartConfig row or no
 *   chart config for its `fullConfigId`.
 */
async function getRecords(
    trx: db.KnexReadonlyTransaction,
    multiDim: DbEnrichedMultiDimDataPage,
    tags: string[],
    pageviews: Record<string, { views_7d: number }>
) {
    const { slug } = multiDim
    console.log(
        `Creating ${multiDim.config.views.length} records for mdim ${slug}`
    )
    // NOTE(review): this helper loads the mapping for all multi-dims on every
    // call; consider fetching it once in the caller if this becomes slow.
    const multiDimXChartConfigIdMap = await getMultiDimXChartConfigIdMap(trx)
    const chartConfigs = await getChartConfigsByIds(
        trx,
        multiDim.config.views.map((view) => view.fullConfigId)
    )
    const relevantVariableIds = getRelevantVariableIds(multiDim.config)
    const relevantVariableMetadata =
        await getRelevantVariableMetadata(relevantVariableIds)
    // Use a single timestamp so publishedAt/updatedAt agree across the batch.
    const indexedAt = new Date().toISOString()
    return multiDim.config.views.map((view) => {
        const viewId = multiDimDimensionsToViewId(view.dimensions)
        const id = multiDimXChartConfigIdMap.get(`${multiDim.id}-${viewId}`)
        if (!id) {
            throw new Error(
                `MultiDimXChartConfig not found multiDimId=${multiDim.id} viewId=${viewId}`
            )
        }
        const chartConfig = chartConfigs.get(view.fullConfigId)
        if (!chartConfig) {
            throw new Error(
                `MultiDim view chart config not found id=${multiDim.id} ` +
                    `viewId=${viewId} chartConfigId=${view.fullConfigId}`
            )
        }
        const queryStr = queryParamsToStr(view.dimensions)
        const variableId = view.indicators.y[0].id
        // Merge into a fresh object: a lodash-style `merge` mutates its first
        // argument, so merging straight into the shared per-variable metadata
        // would leak one view's overrides into every later view that uses the
        // same first y indicator.
        const metadata = merge(
            {},
            relevantVariableMetadata[variableId],
            multiDim.config.metadata,
            view.metadata
        )
        const title =
            metadata.presentation?.titlePublic ||
            chartConfig.title ||
            metadata.display?.name ||
            metadata.name ||
            ""
        const subtitle = metadata.descriptionShort || chartConfig.subtitle || ""
        const availableEntities = metadata.dimensions.entities.values
            .map((entity) => entity.name)
            .filter(Boolean)
        // Pageviews are tracked per multi-dim page, not per view.
        const views_7d = pageviews[`/grapher/${slug}`]?.views_7d ?? 0
        // More views rank higher; shorter titles win ties.
        const score = views_7d * 10 - title.length
        return {
            type: ChartRecordType.MultiDimView,
            objectID: `mdim-view-${id}`,
            chartId: -1,
            slug,
            queryParams: queryStr,
            title,
            subtitle,
            variantName: chartConfig.variantName,
            keyChartForTags: [],
            tags,
            availableEntities,
            publishedAt: indexedAt,
            updatedAt: indexedAt,
            numDimensions: chartConfig.dimensions?.length ?? 0,
            titleLength: title.length,
            numRelatedArticles: 0,
            views_7d,
            score,
        }
    })
}

/**
 * Loads all published multi-dim data pages and pairs each one with its topic
 * tags plus the names of those tags' parent tags (deduplicated).
 * A multi-dim without any topic tags is reported via the error logger but is
 * still included, with an empty tag list.
 */
async function getMultiDimDataPagesWithInheritedTags(
    trx: db.KnexReadonlyTransaction
) {
    const publishedMultiDims = await getAllPublishedMultiDimDataPages(trx)
    const parentTagArraysByChildName =
        await db.getParentTagArraysByChildName(trx)

    const multiDimsWithTags = []
    for (const multiDim of publishedMultiDims) {
        const ownTags = multiDim.config.topicTags ?? []
        if (!ownTags.length) {
            await logErrorAndMaybeCaptureInSentry(
                new Error(`MultiDim "${multiDim.slug}" has no tags.`)
            )
        }

        // Each tag is followed by its parent tag names; the Set drops repeats
        // while keeping first-seen order.
        const tagsWithParents = new Set<string>(
            ownTags.flatMap((tag) => [
                tag,
                ...getUniqueNamesFromParentTagArrays(
                    parentTagArraysByChildName[tag]
                ),
            ])
        )

        multiDimsWithTags.push({
            multiDim,
            tags: Array.from(tagsWithParents),
        })
    }

    return multiDimsWithTags
}

/**
 * Produces the search records for all views of all published multi-dim data
 * pages, as one flat array.
 */
export async function getMdimViewRecords(trx: db.KnexReadonlyTransaction) {
    console.log("Getting mdim view records")
    const taggedMultiDims = await getMultiDimDataPagesWithInheritedTags(trx)
    const pageviewsByUrl = await getAnalyticsPageviewsByUrlObj(trx)
    // Start record creation for every multi-dim, then wait for all of them.
    const pendingRecordBatches = taggedMultiDims.map((entry) =>
        getRecords(trx, entry.multiDim, entry.tags, pageviewsByUrl)
    )
    const recordBatches = await Promise.all(pendingRecordBatches)
    return recordBatches.flat()
}
12 changes: 9 additions & 3 deletions db/model/MultiDimDataPage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,21 @@ const enrichRow = <T extends { config: JsonString }>(
config: JSON.parse(row.config),
})

export const getAllPublishedMultiDimDataPages = async (
export async function getAllPublishedMultiDimDataPages(
knex: KnexReadonlyTransaction
): Promise<Map<string, DbEnrichedMultiDimDataPage>> => {
): Promise<DbEnrichedMultiDimDataPage[]> {
const rows = await knex<DbPlainMultiDimDataPage>(
MultiDimDataPagesTableName
).where("published", true)
return rows.map(enrichRow)
}

export const getAllPublishedMultiDimDataPagesBySlug = async (
knex: KnexReadonlyTransaction
): Promise<Map<string, DbEnrichedMultiDimDataPage>> => {
const multiDims = await getAllPublishedMultiDimDataPages(knex)
// Published mdims must have a slug.
return new Map(rows.map((row) => [row.slug!, enrichRow(row)]))
return new Map(multiDims.map((multiDim) => [multiDim.slug!, multiDim]))
}

export async function getAllLinkedPublishedMultiDimDataPages(
Expand Down
Loading

0 comments on commit e898c1d

Please sign in to comment.