From 46a5fbcb6f1c73db68453c7a627fd03ab39b67d9 Mon Sep 17 00:00:00 2001
From: rderbier <rderbier@gmail.com>
Date: Tue, 24 Sep 2024 10:00:22 -0700
Subject: [PATCH] add social data and generator

---
 data/social/README.md       | 133 ++++++++++++++++++++++++++++++++++++
 data/social/contacts.rdf.gz |   3 +
 data/social/contacts.schema |   5 ++
 data/social/generate.py     |  53 ++++++++++++++
 4 files changed, 194 insertions(+)
 create mode 100644 data/social/README.md
 create mode 100644 data/social/contacts.rdf.gz
 create mode 100644 data/social/contacts.schema
 create mode 100644 data/social/generate.py
diff --git a/data/social/README.md b/data/social/README.md
new file mode 100644
index 0000000..6f3c3fa
--- /dev/null
+++ b/data/social/README.md
@@ -0,0 +1,133 @@
+
+Dataset created to support the blog post about variable propagation on 'social' use cases.
+
+https://dgraph.io/blog/post/20240923-variable-propgation/
+
+
+The RDF file `contacts.rdf.gz` has been generated for 10000 users with the script generate.py. 
+
+
+Start the dgraph container with the following command
+
+> docker run -it -d -p 8080:8080 -p 9080:9080 -v /path/to/dgraph-data:/dgraph --name dgraph-dev dgraph/standalone:latest
+
+Copy the files to the mounted directory so that they are seen in Docker.
+
+> cp contacts.rdf.gz <local path to /dgraph-data>
+> cp contacts.schema <local path to /dgraph-data>
+
+Use the `dgraph live` command inside the Docker container to load the data:
+
+> docker exec -it dgraph-dev dgraph live -c 1 -f /dgraph/contacts.rdf.gz -s /dgraph/contacts.schema
+
+You can get some usernames with the following query:
+```graphql
+{
+  user(func: has(username), first: 5) {
+    username
+  }
+}
+```
+
+and test the queries from the blog post
+## mutual 'follows'
+```graphql
+{
+
+userA as var(func: eq(username, "barbara10")) { 
+  # use a named variable userA to be able to exclude this node later in the query
+    c as math(1) # start c =1 on user A Node
+    follows_of_userA as follows {
+        # c is propagated, each follow is reached one time so c =1 for every follow
+      ~follows @filter(NOT uid(userA)) {
+        # ~follows is the reverse relationship
+        # users at this level are reached by all the common follows, 
+        # c = sum all path = count of common follows
+        # keep the value in a variable, 
+        # in Dgraph a variable is a map uid -> value, so we have the count for every target
+                mutual_follows as math(c)
+      }
+    }
+  }
+    
+  target_user(func: uid(mutual_follows), orderdesc: val(mutual_follows), first:1) {
+    username
+    mutual_follows_count: val(mutual_follows)
+    mutual_follows: follows @filter(uid(follows_of_userA)) {
+      username
+    }
+  }
+}
+```
+
+## mutual 'contacts'
+```graphql
+{
+  var(func: eq(username, "barbara10")) {
+    c as math(1)
+    userA_phone_number as ~belongs_to {
+      userA_contacts as has_in_contacts {
+        ~has_in_contacts @filter(NOT uid(userA_phone_number)) {
+          belongs_to{
+              mutual_contacts as Math(c)
+          }
+        }
+      }
+    }
+  }
+  
+  
+  target_user(func: uid(mutual_contacts), orderdesc: val(mutual_contacts), first: 1) {
+    username
+    mutual_contact_count:val(mutual_contacts)
+    phone:~belongs_to {
+      phone_number
+      mutual_contacts: has_in_contacts @filter(uid(userA_contacts))  {
+        phone_number 
+        belongs_to {
+          username
+        }
+      }
+    }
+  }
+}
+```
+
+## computing a complex score
+
+```graphql
+{
+  userA as var(func: eq(username, "barbara10")) { 
+    c as math(1) # start c =1 on user A Node
+    # first block to compute mutual follows using variable propagation
+    follows {
+      ~follows @filter(NOT uid(userA)) {
+                mutual_follows as math(c)
+      }
+    }
+    # second block: compute mutual contacts using the same variable c,
+    # propagated along a different path.
+    userA_phone_number as ~belongs_to {
+      has_in_contacts {
+        ~has_in_contacts @filter(NOT uid(userA_phone_number)) {
+          belongs_to{
+              mutual_contacts as Math(c)
+          }
+        }
+      }
+    }
+  }
+
+# compute a score using the formula
+  var(func: uid(mutual_follows, mutual_contacts)) {
+    score as math(0.4 * mutual_follows + 0.6 * mutual_contacts)
+  }
+# get target info
+  target(func: uid(score), orderdesc: val(score), first: 1) {
+    username
+    score: val(score)
+    count_mutual_follows: val(mutual_follows)
+    count_mutual_contacts: val (mutual_contacts)
+  }
+}
+```
\ No newline at end of file
diff --git a/data/social/contacts.rdf.gz b/data/social/contacts.rdf.gz
new file mode 100644
index 0000000..e00474c
--- /dev/null
+++ b/data/social/contacts.rdf.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:918b231caa96cb00c2c1f2225670dc5f5834403d98b27151a4927bde2c389175
+size 8036100
diff --git a/data/social/contacts.schema b/data/social/contacts.schema
new file mode 100644
index 0000000..214b9e1
--- /dev/null
+++ b/data/social/contacts.schema
@@ -0,0 +1,5 @@
+phone_number: string @index(hash) .
+username: string @index(hash) .
+has_in_contacts: [uid] @reverse .
+belongs_to: uid @reverse .
+follows: [uid] @reverse .
\ No newline at end of file
diff --git a/data/social/generate.py b/data/social/generate.py
new file mode 100644
index 0000000..5be1963
--- /dev/null
+++ b/data/social/generate.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python
+
+from faker import Faker
+import pandas as pd 
+import gzip
+import sys
+from random import randint
+
+
+faker = Faker()  # module-level Faker instance used by generate_data
+def generate_data(size=500):
+    # Return a pandas DataFrame with `size` rows and two columns: 'user_name' and 'phone_number'.
+    phones = [f'{faker.unique.msisdn()[4:]}' for i in range(size)]  # NOTE(review): uniqueness is enforced before the [4:] slice, so truncated numbers could collide - confirm
+    names = [faker.unique.user_name() for i in range(size)]  # drawn through faker.unique, so user names are not repeated
+    df = pd.DataFrame({'user_name': names, 'phone_number': phones})
+    return df
+
+
+def dataframe_to_rdf(data, filehandle = sys.stdout):  # write RDF triples for every row of `data` to `filehandle` (stdout by default)
+    for _, row in data.iterrows():
+        # emit the phone_number and username values, then link the phone node to its user with belongs_to
+        rdf= ""
+        rdf += "<_:{}> <phone_number> \"{}\" .\n".format(row['phone_number'],row['phone_number'])
+        rdf += "<_:{}> <username> \"{}\" .\n".format(row['user_name'],row['user_name'])
+        rdf += "<_:{}> <belongs_to> <_:{}> .\n".format(row['phone_number'],row['user_name'])
+        # add follows relationship
+        # sample 5 to 100 random rows to follow; NOTE(review): DataFrame.sample raises ValueError when n > len(data), so small frames can fail
+        follows = data.sample(n=randint(5, 100))
+        for _, row_target in follows.iterrows():
+            if (row['user_name'] != row_target['user_name']):  # skip the row itself so a user never follows themselves
+                rdf += "<_:{}> <follows> <_:{}> .\n".format(row['user_name'],row_target['user_name'])
+        # add contacts relationship, using a second, independent random sample of 5 to 100 rows
+        contacts = data.sample(n=randint(5, 100))
+        for _, row_target in contacts.iterrows():
+            if (row['phone_number'] != row_target['phone_number']):  # skip the row's own phone number
+                rdf += "<_:{}> <has_in_contacts> <_:{}> .\n".format(row['phone_number'],row_target['phone_number'])
+        filehandle.write(rdf)  # one write per source row, containing all of that row's triples
+    return
+
+
+data = generate_data(10000)  # runs whenever the module is executed or imported: there is no __main__ guard
+# the output is written to ./contacts.rdf.gz, relative to the current working directory
+# gzip file must use wt for write text
+with gzip.open("./contacts.rdf.gz","wt") as f:
+    dataframe_to_rdf(data, f)
+
+
+# ## load data set
+# Start the dgraph container with the following command
+# docker run -it -d -p 8080:8080 -p 9080:9080 -v /path/to/dgraph-data:/dgraph --name dgraph-v24 dgraph/standalone:latest
+# cp contacts.rdf.gz <local path to /dgraph-data>
+# cp contacts.schema <local path to /dgraph-data>
+# docker exec -it dgraph-v24 dgraph live -c 1 -f /dgraph/contacts.rdf.gz -s /dgraph/contacts.schema
\ No newline at end of file