Skip to content

Commit

Permalink
Merge pull request #4313 from JetBrains/ktl-1516-migration-ga-in-search
Browse files Browse the repository at this point in the history
KTL-1516: migration ga in search
  • Loading branch information
zoobestik authored Jul 1, 2024
2 parents 5c0f322 + c5c3efd commit b44e7e3
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 66 deletions.
20 changes: 13 additions & 7 deletions .teamcity/builds/kotlinlang/buidTypes/BuildSearchIndex.kt
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,11 @@ import jetbrains.buildServer.configs.kotlin.buildSteps.script
import jetbrains.buildServer.configs.kotlin.triggers.schedule
import vcsRoots.KotlinLangOrg


object BuildSearchIndex : BuildType({
name = "Build Site Search Index"
description = "Build search index for Algolia using Google Analytics data"

params {
param("env.KEY_FILE_LOCATION", "/secrets/google-credentials.json")
param("virtualenv.folder", "_environment")
param("env.WH_INDEX_NAME", SEARCH_INDEX_NAME)
param("env.WH_SEARCH_USER", SEARCH_APP_ID)
Expand All @@ -33,12 +31,9 @@ object BuildSearchIndex : BuildType({

steps {
script {
name = "Push search index"
scriptContent = """
#!/bin/bash
## refresh packages
pip install -r requirements.txt
#!/bin/bash
python kotlin-website.py index
""".trimIndent()
dockerImage = "%dep.Kotlin_KotlinSites_Builds_KotlinlangOrg_BuildPythonContainer.kotlin-website-image%"
Expand Down Expand Up @@ -71,6 +66,17 @@ object BuildSearchIndex : BuildType({
onDependencyFailure = FailureAction.FAIL_TO_START
onDependencyCancel = FailureAction.CANCEL
}

dependency(PageViews) {
snapshot {}

artifacts {
artifactRules = """
page_views_map.json => data/
""".trimIndent()
}
}

dependency(BuildSitePages) {
snapshot {}

Expand Down
50 changes: 50 additions & 0 deletions .teamcity/builds/kotlinlang/buidTypes/PageViews.kt
Original file line number Diff line number Diff line change
@@ -1,8 +1,58 @@
package builds.kotlinlang.buidTypes

import jetbrains.buildServer.configs.kotlin.AbsoluteId
import jetbrains.buildServer.configs.kotlin.BuildType
import jetbrains.buildServer.configs.kotlin.buildSteps.ScriptBuildStep
import jetbrains.buildServer.configs.kotlin.buildSteps.script
import jetbrains.buildServer.configs.kotlin.triggers.finishBuildTrigger
import java.io.File
import java.nio.file.Paths

/**
 * Loads the source text of a helper script so it can be inlined into a build step.
 *
 * The file is looked up as `scripts/<name>.mjs`; the relative path is made absolute
 * with `toAbsolutePath()` before it is read.
 *
 * @param name script path relative to `scripts/`, without the `.mjs` extension
 * @return the complete file content (decoded with `readText()`'s default charset, UTF-8)
 */
private fun readScript(name: String): String =
    Paths.get("scripts/$name.mjs")
        .toAbsolutePath()
        .toFile()
        .readText()

// Absolute id of the build configuration whose successful finish triggers PageViews
// and whose exported artifact PageViews downloads as its input.
private val pageViewsCollectId = AbsoluteId("WebTeam_BuildsForDeploymentJetBrainsCom_Algolia_PageViewsFromGoogle")

/**
 * TeamCity build configuration that converts the raw page-view export into the
 * `page_views_list.json` and `page_views_map.json` artifacts.
 *
 * It is triggered when the build identified by [pageViewsCollectId] finishes successfully
 * on the default branch, and it downloads the export from that build's last successful run
 * into `data/`.
 */
object PageViews : BuildType({
    name = "Fetch Page Views"
    description = "Build data files with page views statistics for kotlin websites"

    artifactRules = """
        page_views_list.json
        page_views_map.json
    """.trimIndent()

    triggers {
        finishBuildTrigger {
            buildType = pageViewsCollectId.absoluteId
            branchFilter = "+:<default>"
            successfulOnly = true
        }
    }

    steps {
        script {
            name = "Prepare page views"
            // The step content is a bash/JavaScript polyglot:
            //  * line 1 is a shebang for the shell (and a hashbang for node, which is only
            //    legal at the very first character of the source);
            //  * line 2 is, for bash, the no-op `:` followed by `exec node` that feeds this very
            //    file ("$0") to node on stdin as an ES module; for JavaScript it is a string
            //    literal statement followed by a `//` comment;
            //  * the rest is the script itself.
            //
            // The lines are joined explicitly instead of using a raw string + trimIndent():
            // trimIndent() would run after the script text has been interpolated, so the
            // script's own lines would take part in common-indent detection and the header
            // lines would only land at column 0 by coincidence of the script's layout.
            scriptContent = listOf(
                "#!/usr/bin/env bash",
                "\":\" //# comment; exec /usr/bin/env node --input-type=module - \"\$@\" < \"\$0\"",
                readScript("stats/pageviews")
            ).joinToString("\n")
            dockerImage = "node:lts-slim"
            dockerImagePlatform = ScriptBuildStep.ImagePlatform.Linux
            dockerPull = true
        }
    }

    dependencies {
        // Input: the export produced by the collecting build, placed under data/.
        artifacts(pageViewsCollectId) {
            buildRule = lastSuccessful()
            artifactRules = """
                +:unique_pageviews_pages_000000000000.json => data
            """.trimIndent()
        }
    }
})
74 changes: 74 additions & 0 deletions .teamcity/scripts/stats/pageviews.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import { open } from 'node:fs/promises';

// Raw page-view export; further down every line of it is parsed as one JSON record.
const INPUT_FILE_PATH = 'data/unique_pageviews_pages_000000000000.json';

// Handle on the export, opened for reading ('r' is the default flag of open()).
const input = await open(INPUT_FILE_PATH);

/**
 * Opens a report file for writing, starting from empty content.
 *
 * @param {string} name path of the report file
 * @returns {Promise<import('node:fs/promises').FileHandle>} handle positioned at the start of
 *   the (now empty) file; the caller is responsible for closing it
 */
async function openReportFile(name) {
    // Flag 'w' already creates the file or truncates an existing one, so no separate
    // truncate(0) is needed -- and there is no step left that could fail after the
    // handle was opened and leave it unclosed.
    return open(name, 'w');
}

// Output handles, opened together:
//   listViews -> page_views_list.json
//   mapViews  -> page_views_map.json
const [listViews, mapViews] = await Promise.all(
    ['page_views_list.json', 'page_views_map.json'].map(name => openReportFile(name)),
);

// Streams the export line by line and writes both reports incrementally:
//   page_views_list.json -> [{"url":"...","pageviews":N},...]
//   page_views_map.json  -> {"<url>": N,...}
try {
    /**
     * Converts one input line into a report entry.
     *
     * @param {string} line one JSON record with `webpage` and `unique_pageviews` fields
     * @returns {{url: string, pageviews: number} | null} the entry, or null when the record
     *   is skipped (non-numeric or empty view count, fewer than one view, foreign host)
     * @throws {SyntaxError} if the line is not valid JSON
     * @throws {TypeError} if `webpage` cannot be parsed by `new URL()`
     */
    function toEntry(line) {
        const { webpage: url, unique_pageviews: views } = JSON.parse(line);

        const pageviews = Number(views);

        // Number('') is 0, so the empty string has to be rejected explicitly.
        if (views === '' || isNaN(pageviews)) {
            console.warn(`${url} has incorrect unique_pageviews=${views}`);
            return null;
        }

        if (pageviews < 1) return null;

        // NOTE(review): this is a substring match, so any host that merely contains
        // 'kotlinlang.org' is accepted as well -- confirm that this is intended.
        if (!(new URL(url).host.includes('kotlinlang.org'))) return null;

        return { url, pageviews };
    }

    await Promise.all([
        listViews.appendFile('['),
        mapViews.appendFile('{'),
    ]);

    // Written in front of every entry except the first. This way there is never a trailing
    // comma to patch afterwards, and an input without qualifying records still produces
    // valid JSON ("[]" and "{}").
    let separator = '';

    // Records are processed strictly one after another: Node documents several writes on the
    // same FileHandle without awaiting the previous one as unsafe. Parse errors propagate
    // from here as an ordinary exception, so the handles below are still closed.
    for await (const line of input.readLines()) {
        if (line.trim() === '') continue; // tolerate blank lines in the export

        const entry = toEntry(line);
        if (entry === null) continue;

        await Promise.all([
            listViews.appendFile(separator + JSON.stringify(entry)),
            mapViews.appendFile(`${separator}${JSON.stringify(entry.url)}: ${entry.pageviews}`),
        ]);

        separator = ',';
    }

    await Promise.all([
        listViews.appendFile(']'),
        mapViews.appendFile('}'),
    ]);
} finally {
    await Promise.all([
        input.close(),
        listViews.close(),
        mapViews.close(),
    ]);
}
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,4 @@ git+https://github.com/pik-software/geocoder.git@yandex-api-key#egg=geocoder
ruamel.yaml==0.17.21
PyYAML==5.4.1
algoliasearch==1.20.0
google-api-python-client==1.6.2
Werkzeug==2.3.8
70 changes: 12 additions & 58 deletions src/search.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,24 @@
import json
import os
from typing import Dict, List, Iterator

from algoliasearch import algoliasearch
from algoliasearch.index import Index
from bs4 import Tag
from googleapiclient.discovery import build, Resource
from oauth2client.service_account import ServiceAccountCredentials

from src.api import get_api_page
from src.dist import get_dist_page_xml, dist_path


def initialize_analyticsreporting() -> Resource:
credentials = ServiceAccountCredentials.from_json_keyfile_name(
os.environ['KEY_FILE_LOCATION'], scopes='https://www.googleapis.com/auth/analytics.readonly')
analytics = build('analyticsreporting', 'v4', credentials=credentials)
return analytics


def get_report(analytics: Resource) -> Dict:
return analytics.reports().batchGet(
body={
"reportRequests":
[
{
"viewId": "85132606",
"samplingLevel": "LARGE",
"filtersExpression": "ga:hostname==kotlinlang.org;ga:pagepath!@?",
"pageSize": 10000,
"orderBys": [
{
"fieldName": "ga:uniquepageviews",
"sortOrder": "DESCENDING"
}
],
"dateRanges":
[
{
"startDate": "30daysAgo",
"endDate": "yesterday"
}
],
"metrics":
[
{
"expression": "ga:uniquepageviews",
"alias": ""
}
],
"dimensions":
[
{
"name": "ga:pagePath"
}
]
}
]
}).execute()
def get_page_views_statistic() -> Dict[str, int]:
print("Acquiring page view statistic")

file = open("data/page_views_map.json", "r")
page_views = json.load(file)
file.close()

def get_page_views_statistic() -> Dict[str, int]:
print("Acquiring page view statistic from google")
page_views = {}
analytics = initialize_analyticsreporting()
report = get_report(analytics)
for row in report["reports"][0]["data"]["rows"]:
page_views[row["dimensions"][0]] = int(row['metrics'][0]["values"][0])
print("Page view statistic acquired")

return page_views


Expand Down Expand Up @@ -110,7 +62,8 @@ def get_valuable_content(page_path, content: Iterator[Tag]) -> List[str]:
valuable_content.append(child.text)
elif child.name in ['ul', 'ol', 'blockquote', 'div', 'section', 'dl']:
valuable_content += get_valuable_content(page_path, child.children)
elif child.name in ['figure', 'iframe', 'pre', 'code', 'hr', 'table', 'script', 'link', 'a', 'br', 'i', 'img', 'object']:
elif child.name in ['figure', 'iframe', 'pre', 'code', 'hr', 'table', 'script', 'link', 'a', 'br', 'i', 'img',
'object']:
continue
else:
raise Exception('Unknown tag "' + child.name + '" in ' + page_path)
Expand Down Expand Up @@ -243,8 +196,9 @@ def build_search_indices(pages):
page_path = get_page_path_from_url(url)
page_views = 0

if url in page_views_statistic:
page_views = page_views_statistic[url]
public_url = "https://kotlinlang.org" + url
if public_url in page_views_statistic:
page_views = page_views_statistic[public_url]

if type == 'Page_Community':
page_type = 'Community'
Expand Down

0 comments on commit b44e7e3

Please sign in to comment.