-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadd_data.py
95 lines (80 loc) · 2.53 KB
/
add_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import sys
sys.path.append("")
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
# Client for the local single-node Elasticsearch instance.
es = Elasticsearch("http://localhost:9200")
# Request the cluster info; the response body is evaluated but not stored.
es.info().body

# Load the movie-plot dataset, discard every row that has a missing value,
# then renumber the surviving rows from zero (reset_index keeps the previous
# row labels in an extra "index" column, which is not used further here).
df = pd.read_csv("data/wiki_movie_plots_deduped.csv")
df = df.dropna()
df = df.reset_index()
# Field mappings for the "movies" index. Free-text fields are analyzed with
# either the "english" or the "standard" analyzer; "year" is stored as an
# integer and "wiki_page" as an exact-match keyword.
mappings = {
    "properties": {
        "title": {"type": "text", "analyzer": "english"},
        "ethnicity": {"type": "text", "analyzer": "standard"},
        "director": {"type": "text", "analyzer": "english"},
        "cast": {"type": "text", "analyzer": "english"},
        "genre": {"type": "text", "analyzer": "standard"},
        "plot": {"type": "text", "analyzer": "english"},
        "year": {"type": "integer"},
        "wiki_page": {"type": "keyword"},
    }
}

# Recreate the index so the mappings above take effect.
# ignore_unavailable=True turns the delete into a no-op when the index does
# not exist yet (e.g. on the very first run) instead of raising NotFoundError.
es.indices.delete(index="movies", ignore_unavailable=True)
es.indices.create(index="movies", mappings=mappings)
def _movie_source(row):
    """Build the document body stored in the "movies" index from one CSV row.

    Args:
        row: A row of the movie DataFrame (as yielded by ``df.iterrows()``),
            indexable by the CSV column names.

    Returns:
        A dict whose keys match the fields declared in the index mappings.
    """
    return {
        "title": row["Title"],
        "ethnicity": row["Origin/Ethnicity"],
        "director": row["Director"],
        "cast": row["Cast"],
        "genre": row["Genre"],
        "plot": row["Plot"],
        "year": row["Release Year"],
        "wiki_page": row["Wiki Page"],
    }


# Insert documents with a single bulk pass. Previously every row was first
# sent with its own es.index() request and then sent again through bulk()
# under the same _id, so each document was written twice. The bulk helper
# alone produces the same final index contents with far fewer round trips.
bulk_data = [
    {"_index": "movies", "_id": i, "_source": _movie_source(row)}
    for i, row in df.iterrows()
]
bulk(es, bulk_data)
# Refresh so that everything indexed so far is visible to search.
es.indices.refresh(index="movies")
# Request the document count of the index; the response is not stored.
es.cat.count(index="movies", format="json")

# Sample query: movies whose cast matches the phrase "jack nicholson",
# excluding those whose director matches the phrase "roman polanski".
cast_clause = {
    "match_phrase": {
        "cast": "jack nicholson",
    }
}
director_clause = {"match_phrase": {"director": "roman polanski"}}
sample_query = {
    "bool": {
        "must": cast_clause,
        "filter": {"bool": {"must_not": director_clause}},
    },
}
resp = es.search(index="movies", query=sample_query)
print('res', resp.body)