From 46a5fbcb6f1c73db68453c7a627fd03ab39b67d9 Mon Sep 17 00:00:00 2001 From: rderbier Date: Tue, 24 Sep 2024 10:00:22 -0700 Subject: [PATCH] add social data and generator --- data/social/README.md | 133 ++++++++++++++++++++++++++++++++++++ data/social/contacts.rdf.gz | 3 + data/social/contacts.schema | 5 ++ data/social/generate.py | 53 ++++++++++++++ 4 files changed, 194 insertions(+) create mode 100644 data/social/README.md create mode 100644 data/social/contacts.rdf.gz create mode 100644 data/social/contacts.schema create mode 100644 data/social/generate.py diff --git a/data/social/README.md b/data/social/README.md new file mode 100644 index 0000000..6f3c3fa --- /dev/null +++ b/data/social/README.md @@ -0,0 +1,133 @@ + +Dataset created to support the blog post about variable propagation on 'social' use cases. + +https://dgraph.io/blog/post/20240923-variable-propgation/ + + +The RDF file `contacts.rdf.gz` has been generated for 10000 users with the script generate.py. + + +Start the dgraph container with the following command + +> docker run -it -d -p 8080:8080 -p 9080:9080 -v /path/to/dgraph-data:/dgraph --name dgraph-dev dgraph/standalone:latest + +Copy the files to the mounted directory so that they are seen in Docker. 
+ +> cp contacts.rdf.gz /path/to/dgraph-data/ +> cp contacts.schema /path/to/dgraph-data/ + +Use the dgraph live command in the docker instance + +> docker exec -it dgraph-dev dgraph live -c 1 -f /dgraph/contacts.rdf.gz -s /dgraph/contacts.schema + +You can get some usernames + +```graphql +{ + user(func:has(username), first:5) { + username + } +} +``` + +and test the queries from the blog post +## mutual 'follows' +```graphql +{ + +userA as var(func: eq(username, "barbara10")) { + # use a named variable userA to be able to exclude this node later in the query + c as math(1) # start c =1 on user A Node + follows_of_userA as follows { + # c is propagated, each follow is reached one time so c =1 for every follow + ~follows @filter(NOT uid(userA)) { + # ~follows is the reverse relationship + # users at this level are reached by all the common follows, + # c = sum all path = count of common follows + # keep the value in a variable, + # in Dgraph a variable is a map uid -> value, so we have the count for every target + mutual_follows as math(c) + } + } + } + + target_user(func: uid(mutual_follows), orderdesc: val(mutual_follows), first:1) { + username + mutual_follows_count: val(mutual_follows) + mutual_follows: follows @filter(uid(follows_of_userA)) { + username + } + } +} +``` + +## mutual 'contacts' +```graphql +{ + var(func: eq(username, "barbara10")) { + c as math(1) + userA_phone_number as ~belongs_to { + userA_contacts as has_in_contacts { + ~has_in_contacts @filter(NOT uid(userA_phone_number)) { + belongs_to{ + mutual_contacts as Math(c) + } + } + } + } + } + + + target_user(func: uid(mutual_contacts), orderdesc: val(mutual_contacts), first: 1) { + username + mutual_contact_count:val(mutual_contacts) + phone:~belongs_to { + phone_number + mutual_contacts: has_in_contacts @filter(uid(userA_contacts)) { + phone_number + belongs_to { + username + } + } + } + } +} +``` + +## computing a complex score + +```graphql +{ + userA as var(func: eq(username, "barbara10")) { + c as math(1) # start c =1 on user A Node + # 
first block to compute mutual follows using variable propagation + follows { + ~follows @filter(NOT uid(userA)) { + mutual_follows as math(c) + } + } + # second block to compute mutual contacts using same variable ! + # different path. + userA_phone_number as ~belongs_to { + has_in_contacts { + ~has_in_contacts @filter(NOT uid(userA_phone_number)) { + belongs_to{ + mutual_contacts as Math(c) + } + } + } + } + } + +# compute a score using the formula + var(func: uid(mutual_follows, mutual_contacts)) { + score as math(0.4 * mutual_follows + 0.6 * mutual_contacts) + } +# get target info + target(func: uid(score), orderdesc: val(score), first: 1) { + username + score: val(score) + count_mutual_follows: val(mutual_follows) + count_mutual_contacts: val (mutual_contacts) + } +} +``` \ No newline at end of file diff --git a/data/social/contacts.rdf.gz b/data/social/contacts.rdf.gz new file mode 100644 index 0000000..e00474c --- /dev/null +++ b/data/social/contacts.rdf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:918b231caa96cb00c2c1f2225670dc5f5834403d98b27151a4927bde2c389175 +size 8036100 diff --git a/data/social/contacts.schema b/data/social/contacts.schema new file mode 100644 index 0000000..214b9e1 --- /dev/null +++ b/data/social/contacts.schema @@ -0,0 +1,5 @@ +phone_number: string @index(hash) . +username: string @index(hash) . +has_in_contacts: [uid] @reverse . +belongs_to: uid @reverse . +follows: [uid] @reverse . 
#!/usr/bin/env python3
"""Generate a synthetic 'social' dataset as gzipped RDF for Dgraph.

Each generated user gets a unique user name and a unique phone number.
The RDF output uses these predicates:

* ``phone_number``    -- phone node   -> its number (string)
* ``username``        -- user node    -> its name (string)
* ``belongs_to``      -- phone node   -> owning user node
* ``follows``         -- user node    -> followed user node
* ``has_in_contacts`` -- phone node   -> phone node stored as a contact

Run as a script to write ``./contacts.rdf.gz`` for 10000 users.
"""

import gzip
import sys
from random import randint
from typing import TextIO

import pandas as pd

# Bounds (inclusive) for how many follows / contacts are drawn per user.
MIN_LINKS = 5
MAX_LINKS = 100


def generate_data(size: int = 500) -> pd.DataFrame:
    """Return a DataFrame of ``size`` fake users.

    The frame has two string columns, ``user_name`` and ``phone_number``;
    values are unique within each column.
    """
    # Imported here so the RDF writer can be used without Faker installed.
    from faker import Faker

    fake = Faker()

    # The phone number is the msisdn without its first four digits. Uniqueness
    # must be enforced on that shortened value: two distinct msisdns can share
    # the same suffix, which would merge two phone nodes in the RDF output.
    phones = []
    seen = set()
    while len(phones) < size:
        phone = fake.msisdn()[4:]
        if phone not in seen:
            seen.add(phone)
            phones.append(phone)

    names = [fake.unique.user_name() for _ in range(size)]
    return pd.DataFrame({'user_name': names, 'phone_number': phones})


def _sample_others(data: pd.DataFrame, column: str, own_value) -> list:
    """Return ``column`` values of a random sample of rows, minus ``own_value``."""
    # Clamp to the frame size: DataFrame.sample raises ValueError when asked
    # for more rows than exist (sampling is without replacement).
    count = min(randint(MIN_LINKS, MAX_LINKS), len(data))
    sampled = data.sample(n=count)
    return [value for value in sampled[column] if value != own_value]


def dataframe_to_rdf(data: pd.DataFrame, filehandle: TextIO = sys.stdout) -> None:
    """Write ``data`` to ``filehandle`` as RDF triples, one per line.

    ``data`` must have ``user_name`` and ``phone_number`` columns. For every
    row the phone and user nodes are emitted, linked with ``belongs_to``,
    followed by random ``follows`` (user -> user) and ``has_in_contacts``
    (phone -> phone) edges towards other rows. A row never links to itself.
    Blank-node labels are the raw user name / phone number.
    """
    for user, phone in zip(data['user_name'], data['phone_number']):
        # add users and phone numbers to the rdf file
        triples = [
            f'<_:{phone}> <phone_number> "{phone}" .\n',
            f'<_:{user}> <username> "{user}" .\n',
            f'<_:{phone}> <belongs_to> <_:{user}> .\n',
        ]
        # add follows relationship towards a random set of other users
        triples.extend(
            f'<_:{user}> <follows> <_:{target}> .\n'
            for target in _sample_others(data, 'user_name', user)
        )
        # add contacts relationship towards a random set of other phones
        triples.extend(
            f'<_:{phone}> <has_in_contacts> <_:{target}> .\n'
            for target in _sample_others(data, 'phone_number', phone)
        )
        filehandle.writelines(triples)


def main() -> None:
    """Generate 10000 users and write them to ``./contacts.rdf.gz``."""
    data = generate_data(10000)
    # gzip file must use wt for write text
    with gzip.open("./contacts.rdf.gz", "wt", encoding="utf-8") as f:
        dataframe_to_rdf(data, f)


if __name__ == "__main__":
    main()


# ## load data set
# Start the dgraph container with the following command
# docker run -it -d -p 8080:8080 -p 9080:9080 -v /path/to/dgraph-data:/dgraph --name dgraph-v24 dgraph/standalone:latest
# cp contacts.rdf.gz /path/to/dgraph-data/
# cp contacts.schema /path/to/dgraph-data/
# docker exec -it dgraph-v24 dgraph live -c 1 -f /dgraph/contacts.rdf.gz -s /dgraph/contacts.schema