Skip to content

Commit

Permalink
fix(assistant): increase properties sample count (#27967)
Browse files Browse the repository at this point in the history
  • Loading branch information
skoob13 authored Jan 29, 2025
1 parent fff6794 commit 00f2496
Show file tree
Hide file tree
Showing 10 changed files with 127 additions and 19 deletions.
20 changes: 10 additions & 10 deletions ee/hogai/taxonomy_agent/test/test_toolkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,28 +124,28 @@ def test_retrieve_entity_property_values(self):
team=self.team, type=PropertyDefinition.Type.PERSON, name="id", property_type=PropertyType.Numeric
)

for i in range(5):
for i in range(25):
id = f"person{i}"
with freeze_time(f"2024-01-01T{i}:00:00Z"):
with freeze_time(f"2024-01-01T00:{i}:00Z"):
_create_person(
distinct_ids=[id],
properties={"email": f"{id}@example.com", "id": i},
team=self.team,
)
with freeze_time(f"2024-01-02T00:00:00Z"):
_create_person(
distinct_ids=["person5"],
properties={"email": "person5@example.com", "id": 5},
distinct_ids=["person25"],
properties={"email": "person25@example.com", "id": 25},
team=self.team,
)

self.assertEqual(
self.assertIn(
'"person5@example.com", "person4@example.com", "person3@example.com", "person2@example.com", "person1@example.com"',
toolkit.retrieve_entity_property_values("person", "email"),
'"person5@example.com", "person4@example.com", "person3@example.com", "person2@example.com", "person1@example.com" and 1 more distinct value.',
)
self.assertEqual(
self.assertIn(
"1 more distinct value",
toolkit.retrieve_entity_property_values("person", "id"),
"5, 4, 3, 2, 1 and 1 more distinct value.",
)

toolkit = DummyToolkit(self.team)
Expand Down Expand Up @@ -181,7 +181,7 @@ def test_retrieve_entity_property_values(self):

self.assertEqual(
toolkit.retrieve_entity_property_values("proj", "test"),
"6, 5, 4, 3, 2 and 2 more distinct values.",
"6, 5, 4, 3, 2, 1, 0",
)
self.assertEqual(toolkit.retrieve_entity_property_values("org", "test"), '"7"')

Expand Down Expand Up @@ -257,7 +257,7 @@ def test_retrieve_event_property_values(self):
self.assertEqual(toolkit.retrieve_event_property_values("event1", "bool"), "true")
self.assertEqual(
toolkit.retrieve_event_property_values("event1", "id"),
"9, 8, 7, 6, 5 and 5 more distinct values.",
"9, 8, 7, 6, 5, 4, 3, 2, 1, 0",
)
self.assertEqual(
toolkit.retrieve_event_property_values("event1", "date"), f'"{datetime(2024, 1, 1).isoformat()}"'
Expand Down
12 changes: 7 additions & 5 deletions ee/hogai/taxonomy_agent/toolkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from pydantic import BaseModel, Field, RootModel

from posthog.taxonomy.taxonomy import CORE_FILTER_DEFINITIONS_BY_GROUP
from posthog.hogql.database.schema.channel_type import DEFAULT_CHANNEL_TYPES
from posthog.hogql_queries.ai.actors_property_taxonomy_query_runner import ActorsPropertyTaxonomyQueryRunner
from posthog.hogql_queries.ai.event_taxonomy_query_runner import EventTaxonomyQueryRunner
Expand All @@ -21,6 +20,7 @@
CachedEventTaxonomyQueryResponse,
EventTaxonomyQuery,
)
from posthog.taxonomy.taxonomy import CORE_FILTER_DEFINITIONS_BY_GROUP


class ToolkitTool(TypedDict):
Expand Down Expand Up @@ -274,7 +274,7 @@ def retrieve_event_properties(self, event_name: str) -> str:
"""
Retrieve properties for an event.
"""
runner = EventTaxonomyQueryRunner(EventTaxonomyQuery(event=event_name), self._team)
runner = EventTaxonomyQueryRunner(EventTaxonomyQuery(event=event_name, maxPropertyValues=25), self._team)
response = runner.run(ExecutionMode.RECENT_CACHE_CALCULATE_ASYNC_IF_STALE_AND_BLOCKING_ON_MISS)

if not isinstance(response, CachedEventTaxonomyQueryResponse):
Expand Down Expand Up @@ -336,7 +336,7 @@ def retrieve_event_property_values(self, event_name: str, property_name: str) ->
except PropertyDefinition.DoesNotExist:
return f"The property {property_name} does not exist in the taxonomy."

runner = EventTaxonomyQueryRunner(EventTaxonomyQuery(event=event_name), self._team)
runner = EventTaxonomyQueryRunner(EventTaxonomyQuery(event=event_name, maxPropertyValues=25), self._team)
response = runner.run(ExecutionMode.RECENT_CACHE_CALCULATE_ASYNC_IF_STALE_AND_BLOCKING_ON_MISS)

if not isinstance(response, CachedEventTaxonomyQueryResponse):
Expand Down Expand Up @@ -389,12 +389,14 @@ def retrieve_entity_property_values(self, entity: str, property_name: str) -> st
return self._retrieve_session_properties(property_name)

if entity == "person":
query = ActorsPropertyTaxonomyQuery(property=property_name)
query = ActorsPropertyTaxonomyQuery(property=property_name, maxPropertyValues=25)
else:
group_index = next((group.group_type_index for group in self._groups if group.group_type == entity), None)
if group_index is None:
return f"The entity {entity} does not exist in the taxonomy."
query = ActorsPropertyTaxonomyQuery(group_type_index=group_index, property=property_name)
query = ActorsPropertyTaxonomyQuery(
group_type_index=group_index, property=property_name, maxPropertyValues=25
)

try:
if query.group_type_index is not None:
Expand Down
6 changes: 6 additions & 0 deletions frontend/src/queries/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@
"const": "ActorsPropertyTaxonomyQuery",
"type": "string"
},
"maxPropertyValues": {
"$ref": "#/definitions/integer"
},
"modifiers": {
"$ref": "#/definitions/HogQLQueryModifiers",
"description": "Modifiers used when performing the query"
Expand Down Expand Up @@ -6267,6 +6270,9 @@
"const": "EventTaxonomyQuery",
"type": "string"
},
"maxPropertyValues": {
"$ref": "#/definitions/integer"
},
"modifiers": {
"$ref": "#/definitions/HogQLQueryModifiers",
"description": "Modifiers used when performing the query"
Expand Down
2 changes: 2 additions & 0 deletions frontend/src/queries/schema/schema-general.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2158,6 +2158,7 @@ export interface EventTaxonomyQuery extends DataNode<EventTaxonomyQueryResponse>
kind: NodeKind.EventTaxonomyQuery
event: string
properties?: string[]
maxPropertyValues?: integer
}

export type EventTaxonomyQueryResponse = AnalyticsQueryResponseBase<EventTaxonomyResponse>
Expand All @@ -2175,6 +2176,7 @@ export interface ActorsPropertyTaxonomyQuery extends DataNode<ActorsPropertyTaxo
kind: NodeKind.ActorsPropertyTaxonomyQuery
property: string
group_type_index?: integer
maxPropertyValues?: integer
}

export type ActorsPropertyTaxonomyQueryResponse = AnalyticsQueryResponseBase<ActorsPropertyTaxonomyResponse>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ def calculate(self):
def to_query(self) -> ast.SelectQuery | ast.SelectSetQuery:
query = ast.SelectQuery(
select=[
ast.Call(name="groupArray", args=[ast.Field(chain=["prop"])], params=[ast.Constant(value=5)]),
ast.Call(
name="groupArray",
args=[ast.Field(chain=["prop"])],
params=[ast.Constant(value=self.query.maxPropertyValues or 5)],
),
ast.Call(name="count", args=[]),
],
select_from=ast.JoinExpr(table=self._get_subquery()),
Expand Down
13 changes: 10 additions & 3 deletions posthog/hogql_queries/ai/event_taxonomy_query_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,15 @@ def calculate(self):
)

def to_query(self) -> ast.SelectQuery | ast.SelectSetQuery:
count_expr = ast.Constant(value=self.query.maxPropertyValues or 5)

if not self.query.properties:
return parse_select(
"""
SELECT
key,
-- Pick the latest distinct sample values, up to the requested maximum (five by default).
arraySlice(arrayDistinct(groupArray(value)), 1, 5) AS values,
arraySlice(arrayDistinct(groupArray(value)), 1, {count}) AS values,
count(distinct value) AS total_count
FROM {from_query}
ARRAY JOIN kv.1 AS key, kv.2 AS value
Expand All @@ -65,21 +67,26 @@ def to_query(self) -> ast.SelectQuery | ast.SelectSetQuery:
ORDER BY total_count DESC
LIMIT 500
""",
placeholders={"from_query": self._get_subquery(), "filter": self._get_omit_filter()},
placeholders={
"from_query": self._get_subquery(),
"filter": self._get_omit_filter(),
"count": count_expr,
},
)

return parse_select(
"""
SELECT
key,
arraySlice(arrayDistinct(groupArray(value)), 1, 5) AS values,
arraySlice(arrayDistinct(groupArray(value)), 1, {count}) AS values,
count(DISTINCT value) AS total_count
FROM {from_query}
GROUP BY key
LIMIT 500
""",
placeholders={
"from_query": self._get_subquery(),
"count": count_expr,
},
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,33 @@
max_bytes_before_external_group_by=0
'''
# ---
# name: TestActorsPropertyTaxonomyQueryRunner.test_max_value_count
'''
SELECT groupArray(1)(prop), count()
FROM
(SELECT DISTINCT persons.properties___age AS prop
FROM
(SELECT person.id AS id,
replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'age'), ''), 'null'), '^"|"$', '') AS properties___age,
toTimeZone(person.created_at, 'UTC') AS created_at
FROM person
WHERE and(equals(person.team_id, 99999), ifNull(in(tuple(person.id, person.version),
(SELECT person.id AS id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 99999)
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'UTC'), person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)))), 0)) SETTINGS optimize_aggregation_in_order=1) AS persons
WHERE isNotNull(prop)
ORDER BY persons.created_at DESC)
LIMIT 100 SETTINGS readonly=2,
max_execution_time=60,
allow_experimental_object_type=1,
format_csv_allow_double_quotes=0,
max_ast_elements=4000000,
max_expanded_ast_elements=4000000,
max_bytes_before_external_group_by=0
'''
# ---
# name: TestActorsPropertyTaxonomyQueryRunner.test_person_property_taxonomy_query_runner
'''
SELECT groupArray(5)(prop), count()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,28 @@ def test_group_property_taxonomy_query_runner(self):
self.assertEqual(results.results.sample_count, 1)
# Ensure the value is a string
self.assertEqual(results.results.sample_values[0], "30")

@snapshot_clickhouse_queries
def test_max_value_count(self):
    # Checks that maxPropertyValues limits how many sample values come back,
    # while sample_count still reflects every distinct value of the property.
    for distinct_id, age in (("person1", 29), ("person2", 30), ("person3", 31)):
        _create_person(
            distinct_ids=[distinct_id],
            properties={"age": age},
            team=self.team,
        )

    # regular person property, capped at a single sample value
    taxonomy_query = ActorsPropertyTaxonomyQuery(property="age", maxPropertyValues=1)
    runner = ActorsPropertyTaxonomyQueryRunner(team=self.team, query=taxonomy_query)
    taxonomy = runner.calculate().results

    # Only one sample is returned, but all three distinct ages are counted.
    self.assertEqual(len(taxonomy.sample_values), 1)
    self.assertEqual(taxonomy.sample_count, 3)
33 changes: 33 additions & 0 deletions posthog/hogql_queries/ai/test/test_event_taxonomy_query_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,3 +411,36 @@ def test_property_taxonomy_includes_events_with_partial_property_matches(self):
self.assertEqual(response.results[1].property, "$host")
self.assertEqual(response.results[1].sample_values, ["us.posthog.com"])
self.assertEqual(response.results[1].sample_count, 1)

def test_query_count(self):
    # Checks that maxPropertyValues limits how many sample values are returned
    # for an event property, while sample_count still counts all distinct values.
    _create_person(
        distinct_ids=["person1"],
        # Restored from a scraper-obfuscated placeholder; follows the
        # "<distinct_id>@example.com" convention used by the other fixtures.
        properties={"email": "person1@example.com"},
        team=self.team,
    )

    # Three "event1" events, each carrying a different value of "prop".
    for distinct_id, prop_value in (("person1", "1"), ("person2", "2"), ("person2", "3")):
        _create_event(
            event="event1",
            distinct_id=distinct_id,
            properties={"prop": prop_value},
            team=self.team,
        )

    response = EventTaxonomyQueryRunner(
        team=self.team, query=EventTaxonomyQuery(event="event1", properties=["prop"], maxPropertyValues=1)
    ).calculate()

    # One property row; three distinct values counted, one sample value kept.
    self.assertEqual(len(response.results), 1)
    self.assertEqual(response.results[0].property, "prop")
    self.assertEqual(response.results[0].sample_count, 3)
    self.assertEqual(len(response.results[0].sample_values), 1)
2 changes: 2 additions & 0 deletions posthog/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5824,6 +5824,7 @@ class ActorsPropertyTaxonomyQuery(BaseModel):
)
group_type_index: Optional[int] = None
kind: Literal["ActorsPropertyTaxonomyQuery"] = "ActorsPropertyTaxonomyQuery"
maxPropertyValues: Optional[int] = None
modifiers: Optional[HogQLQueryModifiers] = Field(
default=None, description="Modifiers used when performing the query"
)
Expand Down Expand Up @@ -6157,6 +6158,7 @@ class EventTaxonomyQuery(BaseModel):
)
event: str
kind: Literal["EventTaxonomyQuery"] = "EventTaxonomyQuery"
maxPropertyValues: Optional[int] = None
modifiers: Optional[HogQLQueryModifiers] = Field(
default=None, description="Modifiers used when performing the query"
)
Expand Down

0 comments on commit 00f2496

Please sign in to comment.