diff --git a/.teamcity/builds/kotlinlang/buidTypes/BuildSearchIndex.kt b/.teamcity/builds/kotlinlang/buidTypes/BuildSearchIndex.kt index 8469dbae392..30b92f41dd9 100644 --- a/.teamcity/builds/kotlinlang/buidTypes/BuildSearchIndex.kt +++ b/.teamcity/builds/kotlinlang/buidTypes/BuildSearchIndex.kt @@ -11,13 +11,11 @@ import jetbrains.buildServer.configs.kotlin.buildSteps.script import jetbrains.buildServer.configs.kotlin.triggers.schedule import vcsRoots.KotlinLangOrg - object BuildSearchIndex : BuildType({ name = "Build Site Search Index" description = "Build search index for Algolia using Google Analytics data" params { - param("env.KEY_FILE_LOCATION", "/secrets/google-credentials.json") param("virtualenv.folder", "_environment") param("env.WH_INDEX_NAME", SEARCH_INDEX_NAME) param("env.WH_SEARCH_USER", SEARCH_APP_ID) @@ -33,12 +31,9 @@ object BuildSearchIndex : BuildType({ steps { script { + name = "Push search index" scriptContent = """ - #!/bin/bash - - ## refresh packages - pip install -r requirements.txt - + #!/bin/bash python kotlin-website.py index """.trimIndent() dockerImage = "%dep.Kotlin_KotlinSites_Builds_KotlinlangOrg_BuildPythonContainer.kotlin-website-image%" @@ -71,6 +66,17 @@ object BuildSearchIndex : BuildType({ onDependencyFailure = FailureAction.FAIL_TO_START onDependencyCancel = FailureAction.CANCEL } + + dependency(PageViews) { + snapshot {} + + artifacts { + artifactRules = """ + page_views_map.json => data/ + """.trimIndent() + } + } + dependency(BuildSitePages) { snapshot {} diff --git a/.teamcity/builds/kotlinlang/buidTypes/PageViews.kt b/.teamcity/builds/kotlinlang/buidTypes/PageViews.kt index 65de6038a0b..497cf86d4b4 100644 --- a/.teamcity/builds/kotlinlang/buidTypes/PageViews.kt +++ b/.teamcity/builds/kotlinlang/buidTypes/PageViews.kt @@ -1,8 +1,58 @@ package builds.kotlinlang.buidTypes +import jetbrains.buildServer.configs.kotlin.AbsoluteId import jetbrains.buildServer.configs.kotlin.BuildType +import 
/**
 * Converts the raw page-views export into two JSON reports.
 *
 * Input:  data/unique_pageviews_pages_000000000000.json — one JSON object per
 *         line with the fields `webpage` and `unique_pageviews`.
 * Output: page_views_list.json — array of `{ url, pageviews }` objects
 *         page_views_map.json  — object mapping url to pageviews
 *
 * Rows are skipped when the counter is not a number, when it is below 1, when
 * the URL cannot be parsed, or when the URL host does not contain
 * "kotlinlang.org".
 */
import { open } from 'node:fs/promises';

const INPUT_FILE_PATH = 'data/unique_pageviews_pages_000000000000.json';
const LIST_REPORT_PATH = 'page_views_list.json';
const MAP_REPORT_PATH = 'page_views_map.json';

/** Every handle opened by this script, so `finally` can close all of them. */
const openedHandles = [];

/**
 * Opens a file and remembers the handle for cleanup.
 *
 * @param {string} path file to open
 * @param {string} flags file system flags ('r' to read, 'w' to create/truncate)
 * @returns {Promise<import('node:fs/promises').FileHandle>}
 */
async function openTracked(path, flags) {
  const handle = await open(path, flags);
  openedHandles.push(handle);
  return handle;
}

/**
 * Parses one input line into a report record.
 *
 * @param {string} line raw line of the export
 * @returns {{ url: string, pageviews: number } | null} the record, or `null`
 *   when the line has to be skipped
 * @throws {SyntaxError} when a non-blank line is not valid JSON
 */
function parseRecord(line) {
  // A trailing newline or an empty row must not abort the whole report.
  if (line.trim() === '') return null;

  const { webpage: url, unique_pageviews: views } = JSON.parse(line);
  const pageviews = Number(views);

  // Number('') is 0, so the empty string has to be rejected explicitly.
  if (views === '' || Number.isNaN(pageviews)) {
    console.warn(`${url} has incorrect unique_pageviews=${views}`);
    return null;
  }

  if (pageviews < 1) return null;

  let host;
  try {
    host = new URL(url).host;
  } catch {
    // Treated like a bad counter: report the row and keep going.
    console.warn(`${url} is not a valid URL, skipped`);
    return null;
  }

  // NOTE(review): substring match also accepts hosts such as
  // "kotlinlang.org.example.com" — confirm whether an exact/suffix match is wanted.
  if (!host.includes('kotlinlang.org')) return null;

  return { url, pageviews };
}

try {
  // The input is opened first so a missing export fails before the reports
  // from a previous run are truncated.
  const input = await openTracked(INPUT_FILE_PATH, 'r');
  const listViews = await openTracked(LIST_REPORT_PATH, 'w');
  const mapViews = await openTracked(MAP_REPORT_PATH, 'w');

  await listViews.write('[');
  await mapViews.write('{');

  // The separator is written BEFORE every entry except the first one, so the
  // output is valid JSON ("[]" / "{}") even when no row passes the filters.
  let separator = '';

  // Lines are handled one at a time and every write is awaited before the
  // next one starts: writes to a single handle are never in flight together.
  for await (const line of input.readLines()) {
    const record = parseRecord(line);
    if (record === null) continue;

    const { url, pageviews } = record;

    await listViews.write(separator + JSON.stringify({ url, pageviews }));
    await mapViews.write(`${separator}${JSON.stringify(url)}: ${pageviews}`);

    separator = ',';
  }

  await listViews.write(']');
  await mapViews.write('}');
} finally {
  await Promise.all(openedHandles.map(handle => handle.close()));
}
def get_page_views_statistic() -> Dict[str, int]:
    """Load the page view statistic from ``data/page_views_map.json``.

    The path is relative to the current working directory.

    :return: the decoded JSON document; annotated as a mapping from page URL
        to its number of page views.
    :raises OSError: if the file cannot be opened.
    :raises json.JSONDecodeError: if the file does not contain valid JSON.
    """
    print("Acquiring page view statistic")

    # ``with`` closes the file even when json.load raises; the explicit
    # encoding keeps decoding independent of the machine locale.
    with open("data/page_views_map.json", "r", encoding="utf-8") as file:
        page_views = json.load(file)

    print("Page view statistic acquired")

    return page_views
'code', 'hr', 'table', 'script', 'link', 'a', 'br', 'i', 'img', 'object']: + elif child.name in ['figure', 'iframe', 'pre', 'code', 'hr', 'table', 'script', 'link', 'a', 'br', 'i', 'img', + 'object']: continue else: raise Exception('Unknown tag "' + child.name + '" in ' + page_path) @@ -243,8 +196,9 @@ def build_search_indices(pages): page_path = get_page_path_from_url(url) page_views = 0 - if url in page_views_statistic: - page_views = page_views_statistic[url] + public_url = "https://kotlinlang.org" + url + if public_url in page_views_statistic: + page_views = page_views_statistic[public_url] if type == 'Page_Community': page_type = 'Community'