From e898c1d41352b778509e0a81bedeea1cfb35cc72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ra=C4=8D=C3=A1k?= Date: Wed, 5 Mar 2025 14:40:22 +0100 Subject: [PATCH] Add multi-dims to data catalog --- Makefile | 4 +- adminSiteServer/multiDim.ts | 15 +- baker/MultiDimBaker.tsx | 12 +- baker/algolia/configureAlgolia.ts | 2 +- ...plorerViewsMdimViewsAndChartsToAlgolia.ts} | 27 +-- baker/algolia/utils/charts.ts | 6 +- baker/algolia/utils/explorerViews.ts | 3 +- baker/algolia/utils/mdimViews.ts | 159 ++++++++++++++++++ db/model/MultiDimDataPage.ts | 12 +- packages/@ourworldindata/utils/src/index.ts | 3 + .../@ourworldindata/utils/src/multiDim.ts | 12 ++ site/DataCatalog/DataCatalogUtils.ts | 4 +- site/search/ChartHit.tsx | 8 +- site/search/searchTypes.ts | 5 +- 14 files changed, 230 insertions(+), 42 deletions(-) rename baker/algolia/{indexExplorerViewsAndChartsToAlgolia.ts => indexExplorerViewsMdimViewsAndChartsToAlgolia.ts} (73%) create mode 100644 baker/algolia/utils/mdimViews.ts create mode 100644 packages/@ourworldindata/utils/src/multiDim.ts diff --git a/Makefile b/Makefile index 6e69b1af48..f0da863c08 100644 --- a/Makefile +++ b/Makefile @@ -293,8 +293,8 @@ reindex: itsJustJavascript node --enable-source-maps itsJustJavascript/baker/algolia/indexChartsToAlgolia.js @echo '--- Running indexExplorerViewsToAlgolia...' node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsToAlgolia.js - @echo '--- Running indexExplorerViewsAndChartsToAlgolia...' - node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsAndChartsToAlgolia.js + @echo '--- Running indexExplorerViewsMdimViewsAndChartsToAlgolia...' + node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsMdimViewsAndChartsToAlgolia.js delete-algolia-index: itsJustJavascript @echo '==> Deleting Algolia index' diff --git a/adminSiteServer/multiDim.ts b/adminSiteServer/multiDim.ts index 76d927935a..3e6e8cd46f 100644 --- a/adminSiteServer/multiDim.ts +++ b/adminSiteServer/multiDim.ts @@ -19,7 +19,6 @@ import { MultiDimDataPageConfigPreProcessed, MultiDimDataPageConfigRaw, MultiDimDataPagesTableName, - MultiDimDimensionChoices, MultiDimXChartConfigsTableName, parseChartConfigsRow, R2GrapherConfigDirectory, @@ -28,7 +27,7 @@ import { import { mergeGrapherConfigs, MultiDimDataPageConfig, - slugify, + multiDimDimensionsToViewId, } from "@ourworldindata/utils" import * as db from "../db/db.js" import { upsertMultiDimDataPage } from "../db/model/MultiDimDataPage.js" @@ -47,14 +46,6 @@ import { updateChartConfigInDbAndR2, } from "./chartConfigHelpers.js" -function dimensionsToViewId(dimensions: MultiDimDimensionChoices) { - return Object.entries(dimensions) - .sort(([keyA], [keyB]) => keyA.localeCompare(keyB)) - .map(([_, value]) => slugify(value)) - .join("__") - .toLowerCase() -} - function catalogPathFromIndicatorEntry( entry: IndicatorEntryBeforePreProcessing ): string | undefined { @@ -289,7 +280,7 @@ export async function upsertMultiDim( patchGrapherConfig ) const existingChartConfigId = existingViewIdsToChartConfigIds.get( - dimensionsToViewId(view.dimensions) + multiDimDimensionsToViewId(view.dimensions) ) let chartConfigId if (existingChartConfigId) { @@ -330,7 +321,7 @@ export async function upsertMultiDim( for (const view of enrichedConfig.views) { await upsertMultiDimXChartConfigs(knex, { multiDimId, - viewId: dimensionsToViewId(view.dimensions), + viewId: multiDimDimensionsToViewId(view.dimensions), variableId: view.indicators.y[0].id, chartConfigId: view.fullConfigId, }) diff --git a/baker/MultiDimBaker.tsx b/baker/MultiDimBaker.tsx index d91a212281..43d8b88064 100644 --- a/baker/MultiDimBaker.tsx +++ b/baker/MultiDimBaker.tsx @@ -35,12 +35,14 @@ import { import { logErrorAndMaybeCaptureInSentry } from "../serverUtils/errorLog.js" import { getAllPublishedChartSlugs } from "../db/model/Chart.js" import { - getAllPublishedMultiDimDataPages, + getAllPublishedMultiDimDataPagesBySlug, getMultiDimDataPageByCatalogPath, getMultiDimDataPageBySlug, } from "../db/model/MultiDimDataPage.js" -const getRelevantVariableIds = (config: MultiDimDataPageConfigPreProcessed) => { +export function getRelevantVariableIds( + config: MultiDimDataPageConfigPreProcessed +) { // A "relevant" variable id is the first y indicator of each view const allIndicatorIds = config.views .map((view) => view.indicators.y?.[0]?.id) @@ -49,7 +51,9 @@ const getRelevantVariableIds = (config: MultiDimDataPageConfigPreProcessed) => { return new Set(allIndicatorIds) } -async function getRelevantVariableMetadata(variableIds: Iterable) { +export async function getRelevantVariableMetadata( + variableIds: Iterable +) { const metadata = await pMap( variableIds, async (id) => { @@ -248,7 +252,7 @@ export const bakeAllMultiDimDataPages = async ( bakedSiteDir: string, imageMetadata: Record ) => { - const multiDimsBySlug = await getAllPublishedMultiDimDataPages(knex) + const multiDimsBySlug = await getAllPublishedMultiDimDataPagesBySlug(knex) const progressBar = new ProgressBar( "bake multi-dim page [:bar] :current/:total :elapseds :rate/s :name\n", { diff --git a/baker/algolia/configureAlgolia.ts b/baker/algolia/configureAlgolia.ts index e1ec5953cf..30bde7a911 100644 --- a/baker/algolia/configureAlgolia.ts +++ b/baker/algolia/configureAlgolia.ts @@ -161,7 +161,7 @@ export const configureAlgolia = async () => { }) const explorerViewsAndChartsIndex = client.initIndex( - getIndexName(SearchIndexName.ExplorerViewsAndCharts) + getIndexName(SearchIndexName.ExplorerViewsMdimViewsAndCharts) ) await explorerViewsAndChartsIndex.setSettings({ diff --git a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts b/baker/algolia/indexExplorerViewsMdimViewsAndChartsToAlgolia.ts similarity index 73% rename from baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts rename to baker/algolia/indexExplorerViewsMdimViewsAndChartsToAlgolia.ts index e2ea4d1b8b..7c49ab182e 100644 --- a/baker/algolia/indexExplorerViewsAndChartsToAlgolia.ts +++ b/baker/algolia/indexExplorerViewsMdimViewsAndChartsToAlgolia.ts @@ -14,13 +14,16 @@ import { scaleRecordScores } from "./utils/shared.js" import { getChartsRecords } from "./utils/charts.js" import { getIndexName } from "../../site/search/searchClient.js" import { SearchIndexName } from "../../site/search/searchTypes.js" +import { getMdimViewRecords } from "./utils/mdimViews.js" // We get 200k operations with Algolia's Open Source plan. We've hit 140k in the past so this might push us over. // If we standardize the record shape, we could have this be the only index and have a `type` field // to use in /search. -const indexExplorerViewsAndChartsToAlgolia = async () => { +const indexExplorerViewsMdimViewsAndChartsToAlgolia = async () => { if (!ALGOLIA_INDEXING) return - const indexName = getIndexName(SearchIndexName.ExplorerViewsAndCharts) + const indexName = getIndexName( + SearchIndexName.ExplorerViewsMdimViewsAndCharts + ) console.log( `Indexing explorer views and charts to the "${indexName}" index on Algolia` ) @@ -31,15 +34,14 @@ const indexExplorerViewsAndChartsToAlgolia = async () => { ) } - const { explorerViews, grapherViews } = await db.knexReadonlyTransaction( - async (trx) => { + const { explorerViews, mdimViews, grapherViews } = + await db.knexReadonlyTransaction(async (trx) => { return { explorerViews: await getExplorerViewRecords(trx, true), + mdimViews: await getMdimViewRecords(trx), grapherViews: await getChartsRecords(trx), } - }, - db.TransactionCloseMode.Close - ) + }, db.TransactionCloseMode.Close) // Scale grapher records and the default explorer views between 1000 and 10000, // Scale the remaining explorer views between 0 and 1000. @@ -47,8 +49,13 @@ const indexExplorerViewsAndChartsToAlgolia = async () => { // the data catalog to smother Grapher results with hundreds of low-quality Explorer results. const scaledGrapherViews = scaleRecordScores(grapherViews, [1000, 10000]) const scaledExplorerViews = adaptExplorerViews(explorerViews) + const scaledMdimViews = scaleRecordScores(mdimViews, [1000, 10000]) - const records = [...scaledGrapherViews, ...scaledExplorerViews] + const records = [ + ...scaledGrapherViews, + ...scaledExplorerViews, + ...scaledMdimViews, + ] const index = client.initIndex(indexName) console.log(`Indexing ${records.length} records`) @@ -56,8 +63,8 @@ const indexExplorerViewsAndChartsToAlgolia = async () => { console.log(`Indexing complete`) } -indexExplorerViewsAndChartsToAlgolia().catch(async (e) => { - console.error("Error in indexExplorerViewsAndChartsToAlgolia:", e) +indexExplorerViewsMdimViewsAndChartsToAlgolia().catch(async (e) => { + console.error("Error in indexExplorerViewsMdimViewsAndChartsToAlgolia:", e) Sentry.captureException(e) await Sentry.close() process.exit(1) diff --git a/baker/algolia/utils/charts.ts b/baker/algolia/utils/charts.ts index ac74e15353..ca1a1ef6b7 100644 --- a/baker/algolia/utils/charts.ts +++ b/baker/algolia/utils/charts.ts @@ -11,9 +11,11 @@ import { getRelatedArticles } from "../../../db/model/Post.js" import { getPublishedLinksTo } from "../../../db/model/Link.js" import { isPathRedirectedToExplorer } from "../../../explorerAdminServer/ExplorerRedirects.js" import { ParsedChartRecordRow, RawChartRecordRow } from "./types.js" -import { excludeNullish } from "@ourworldindata/utils" +import { + excludeNullish, + getUniqueNamesFromParentTagArrays, +} from "@ourworldindata/utils" import { processAvailableEntities } from "./shared.js" -import { getUniqueNamesFromParentTagArrays } from "@ourworldindata/utils/dist/Util.js" const computeChartScore = (record: Omit): number => { const { numRelatedArticles, views_7d } = record diff --git a/baker/algolia/utils/explorerViews.ts b/baker/algolia/utils/explorerViews.ts index 548fce8d71..7faa510517 100644 --- a/baker/algolia/utils/explorerViews.ts +++ b/baker/algolia/utils/explorerViews.ts @@ -23,7 +23,7 @@ import { import * as db from "../../../db/db.js" import { DATA_API_URL } from "../../../settings/serverSettings.js" -import { keyBy } from "@ourworldindata/utils" +import { getUniqueNamesFromParentTagArrays, keyBy } from "@ourworldindata/utils" import { getAnalyticsPageviewsByUrlObj } from "../../../db/model/Pageview.js" import { CsvUnenrichedExplorerViewRecord, @@ -49,7 +49,6 @@ import { ChartRecord, ChartRecordType, } from "../../../site/search/searchTypes.js" -import { getUniqueNamesFromParentTagArrays } from "@ourworldindata/utils/dist/Util.js" export function explorerViewRecordToChartRecord( e: ExplorerViewFinalRecord diff --git a/baker/algolia/utils/mdimViews.ts b/baker/algolia/utils/mdimViews.ts new file mode 100644 index 0000000000..b101000942 --- /dev/null +++ b/baker/algolia/utils/mdimViews.ts @@ -0,0 +1,159 @@ +import { + ChartConfigsTableName, + DbEnrichedMultiDimDataPage, + DbPlainMultiDimXChartConfig, + DbRawChartConfig, + getUniqueNamesFromParentTagArrays, + merge, + multiDimDimensionsToViewId, + MultiDimXChartConfigsTableName, + parseChartConfig, + queryParamsToStr, +} from "@ourworldindata/utils" +import * as db from "../../../db/db.js" +import { getAllPublishedMultiDimDataPages } from "../../../db/model/MultiDimDataPage.js" +import { getAnalyticsPageviewsByUrlObj } from "../../../db/model/Pageview.js" +import { logErrorAndMaybeCaptureInSentry } from "../../../serverUtils/errorLog.js" +import { ChartRecordType } from "../../../site/search/searchTypes.js" +import { + getRelevantVariableIds, + getRelevantVariableMetadata, +} from "../../MultiDimBaker.js" + +async function getChartConfigsByIds( + knex: db.KnexReadonlyTransaction, + ids: string[] +) { + const rows = await knex(ChartConfigsTableName) + .select("id", "full") + .whereIn("id", ids) + return new Map(rows.map((row) => [row.id, parseChartConfig(row.full)])) +} + +async function getMultiDimXChartConfigIdMap(trx: db.KnexReadonlyTransaction) { + const rows = await trx( + MultiDimXChartConfigsTableName + ).select("id", "multiDimId", "viewId") + return new Map( + rows.map((row) => [`${row.multiDimId}-${row.viewId}`, row.id]) + ) +} + +async function getRecords( + trx: db.KnexReadonlyTransaction, + multiDim: DbEnrichedMultiDimDataPage, + tags: string[], + pageviews: Record +) { + const { slug } = multiDim + console.log( + `Creating ${multiDim.config.views.length} records for mdim ${slug}` + ) + const multiDimXChartConfigIdMap = await getMultiDimXChartConfigIdMap(trx) + const chartConfigs = await getChartConfigsByIds( + trx, + multiDim.config.views.map((view) => view.fullConfigId) + ) + const relevantVariableIds = getRelevantVariableIds(multiDim.config) + const relevantVariableMetadata = + await getRelevantVariableMetadata(relevantVariableIds) + return multiDim.config.views.map((view) => { + const viewId = multiDimDimensionsToViewId(view.dimensions) + const id = multiDimXChartConfigIdMap.get(`${multiDim.id}-${viewId}`) + if (!id) { + throw new Error( + `MultiDimXChartConfig not found multiDimId=${multiDim.id} viewId=${viewId}` + ) + } + const chartConfig = chartConfigs.get(view.fullConfigId) + if (!chartConfig) { + throw new Error( + `MultiDim view chart config not found id=${multiDim.id} ` + + `viewId=${viewId} chartConfigId=${view.fullConfigId}` + ) + } + const queryStr = queryParamsToStr(view.dimensions) + const variableId = view.indicators.y[0].id + const metadata = merge( + relevantVariableMetadata[variableId], + multiDim.config.metadata, + view.metadata + ) + const title = + metadata.presentation?.titlePublic || + chartConfig.title || + metadata.display?.name || + metadata.name || + "" + const subtitle = metadata.descriptionShort || chartConfig.subtitle || "" + const availableEntities = metadata.dimensions.entities.values + .map((entity) => entity.name) + .filter(Boolean) + const views_7d = pageviews[`/grapher/${slug}`]?.views_7d ?? 0 + const score = views_7d * 10 - title.length + return { + type: ChartRecordType.MultiDimView, + objectID: `mdim-view-${id}`, + chartId: -1, + slug, + queryParams: queryStr, + title, + subtitle, + variantName: chartConfig.variantName, + keyChartForTags: [], + tags, + availableEntities, + publishedAt: new Date().toISOString(), + updatedAt: new Date().toISOString(), + numDimensions: chartConfig.dimensions?.length ?? 0, + titleLength: title.length, + numRelatedArticles: 0, + views_7d, + score, + } + }) +} + +async function getMultiDimDataPagesWithInheritedTags( + trx: db.KnexReadonlyTransaction +) { + const multiDims = await getAllPublishedMultiDimDataPages(trx) + const parentTagArrays = await db.getParentTagArraysByChildName(trx) + + const result = [] + for (const multiDim of multiDims) { + const tags = multiDim.config.topicTags ?? [] + if (tags.length === 0) { + await logErrorAndMaybeCaptureInSentry( + new Error(`MultiDim "${multiDim.slug}" has no tags.`) + ) + } + + const fullTags = new Set() + for (const tag of tags) { + fullTags.add(tag) + const parentTags = getUniqueNamesFromParentTagArrays( + parentTagArrays[tag] + ) + for (const parentTag of parentTags) { + fullTags.add(parentTag) + } + } + + result.push({ multiDim, tags: [...fullTags] }) + } + + return result +} + +export async function getMdimViewRecords(trx: db.KnexReadonlyTransaction) { + console.log("Getting mdim view records") + const multiDimsWithTags = await getMultiDimDataPagesWithInheritedTags(trx) + const pageviews = await getAnalyticsPageviewsByUrlObj(trx) + const records = await Promise.all( + multiDimsWithTags.map(({ multiDim, tags }) => + getRecords(trx, multiDim, tags, pageviews) + ) + ) + return records.flat() +} diff --git a/db/model/MultiDimDataPage.ts b/db/model/MultiDimDataPage.ts index 015fe35206..b1a48c2088 100644 --- a/db/model/MultiDimDataPage.ts +++ b/db/model/MultiDimDataPage.ts @@ -52,15 +52,21 @@ const enrichRow = ( config: JSON.parse(row.config), }) -export const getAllPublishedMultiDimDataPages = async ( +export async function getAllPublishedMultiDimDataPages( knex: KnexReadonlyTransaction -): Promise> => { +): Promise { const rows = await knex( MultiDimDataPagesTableName ).where("published", true) + return rows.map(enrichRow) +} +export const getAllPublishedMultiDimDataPagesBySlug = async ( + knex: KnexReadonlyTransaction +): Promise> => { + const multiDims = await getAllPublishedMultiDimDataPages(knex) // Published mdims must have a slug. - return new Map(rows.map((row) => [row.slug!, enrichRow(row)])) + return new Map(multiDims.map((multiDim) => [multiDim.slug!, multiDim])) } export async function getAllLinkedPublishedMultiDimDataPages( diff --git a/packages/@ourworldindata/utils/src/index.ts b/packages/@ourworldindata/utils/src/index.ts index f074bf0f14..2bdc5416d3 100644 --- a/packages/@ourworldindata/utils/src/index.ts +++ b/packages/@ourworldindata/utils/src/index.ts @@ -133,6 +133,7 @@ export { isArrayDifferentFromReference, readFromAssetMap, downloadImage, + getUniqueNamesFromParentTagArrays, getUserNavigatorLanguages, getUserNavigatorLanguagesNonEnglish, } from "./Util.js" @@ -154,6 +155,8 @@ export { grabMetadataForGdocLinkedIndicator, } from "./metadataHelpers.js" +export { multiDimDimensionsToViewId } from "./multiDim.js" + export { capitalize, chunk, diff --git a/packages/@ourworldindata/utils/src/multiDim.ts b/packages/@ourworldindata/utils/src/multiDim.ts new file mode 100644 index 0000000000..0d3f20c12d --- /dev/null +++ b/packages/@ourworldindata/utils/src/multiDim.ts @@ -0,0 +1,12 @@ +import { MultiDimDimensionChoices } from "@ourworldindata/types" +import { slugify } from "./Util.js" + +export function multiDimDimensionsToViewId( + dimensions: MultiDimDimensionChoices +): string { + return Object.entries(dimensions) + .sort(([keyA], [keyB]) => keyA.localeCompare(keyB)) + .map(([_, value]) => slugify(value)) + .join("__") + .toLowerCase() +} diff --git a/site/DataCatalog/DataCatalogUtils.ts b/site/DataCatalog/DataCatalogUtils.ts index d43d35e1c4..2bf2a83b0d 100644 --- a/site/DataCatalog/DataCatalogUtils.ts +++ b/site/DataCatalog/DataCatalogUtils.ts @@ -13,7 +13,9 @@ import { SearchClient } from "algoliasearch" /** * Constants */ -const CHARTS_INDEX = getIndexName(SearchIndexName.ExplorerViewsAndCharts) +const CHARTS_INDEX = getIndexName( + SearchIndexName.ExplorerViewsMdimViewsAndCharts +) const DATA_CATALOG_ATTRIBUTES = [ "title", diff --git a/site/search/ChartHit.tsx b/site/search/ChartHit.tsx index ba1108721e..fae8ffc84a 100644 --- a/site/search/ChartHit.tsx +++ b/site/search/ChartHit.tsx @@ -38,6 +38,7 @@ export function ChartHit({ const [imgLoaded, setImgLoaded] = useState(false) const [imgError, setImgError] = useState(false) const isExplorerView = hit.type === ChartRecordType.ExplorerView + const isMultiDimView = hit.type === ChartRecordType.MultiDimView const entities = useMemo( () => @@ -59,9 +60,10 @@ export function ChartHit({ [entities] ) - const fullQueryParams = isExplorerView - ? hit.queryParams! + entityQueryStr.replace("?", "&") - : entityQueryStr + const fullQueryParams = + isExplorerView || isMultiDimView + ? hit.queryParams! + entityQueryStr.replace("?", "&") + : entityQueryStr function createExplorerViewThumbnailUrl( slug: string, diff --git a/site/search/searchTypes.ts b/site/search/searchTypes.ts index a50a7896dd..facf67f085 100644 --- a/site/search/searchTypes.ts +++ b/site/search/searchTypes.ts @@ -59,6 +59,7 @@ export type IExplorerViewHit = Hit & { export enum ChartRecordType { Chart = "chart", ExplorerView = "explorerView", + MultiDimView = "multiDimView", } export interface ChartRecord { @@ -88,7 +89,7 @@ export enum SearchIndexName { ExplorerViews = "explorer-views", Charts = "charts", Pages = "pages", - ExplorerViewsAndCharts = "explorer-views-and-charts", + ExplorerViewsMdimViewsAndCharts = "explorer-views-and-charts", } export type SearchCategoryFilter = SearchIndexName | "all" @@ -105,5 +106,5 @@ export const indexNameToSubdirectoryMap: Record = { [SearchIndexName.Charts]: "/grapher", [SearchIndexName.ExplorerViews]: "/explorers", // n/a - charts and explorers have different subdirectories, so this needs to be resolved elsewhere - [SearchIndexName.ExplorerViewsAndCharts]: "", + [SearchIndexName.ExplorerViewsMdimViewsAndCharts]: "", }