diff --git a/Cargo.toml b/Cargo.toml index 19c6965..46f3305 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "cleora-python" +name = "pycleora" version = "2.0.0" edition = "2018" license-file = "LICENSE" @@ -8,7 +8,7 @@ documentation = "https://github.com/synerise/cleora" homepage = "https://github.com/synerise/cleora" repository = "https://github.com/synerise/cleora" description = """ -Sparse graph structure and markov-propagation on embeddings exposed via python bindings +Sparse hypergraph structure and markov-propagation for node embeddings exposed via Python bindings. """ [lib] diff --git a/README.md b/README.md index a65ec06..250b281 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,81 @@ _**Cleora** is a genus of moths in the family **Geometridae**. Their scientific Cleora is a general-purpose model for efficient, scalable learning of stable and inductive entity embeddings for heterogeneous relational data. +**Cleora** is now available as a Python package _pycleora_. Key improvements compared to the previous version: +* _performance optimizations_: 10x faster embedding times +* _performance optimizations_: reduced memory usage +* _latest research_: significantly improved embedding quality +* _new feature_: can create graphs from a Python iterator in addition to TSV files +* _new feature_: seamless integration with _NumPy_ +* _new feature_: item attributes support via custom embeddings initialization +* _new feature_: adjustable vector projection / normalization after each propagation step + +**Breaking changes:** +* The _transient_ modifier is no longer supported - creating _complex::reflexive_ columns for hypergraph embeddings, grouped by the transient entity, gives better results. 
+ + +**Example usage:** + +``` +import pycleora +import numpy as np +import pandas as pd +import random + +# Generate example data +customers = [f"Customer_{i}" for i in range(1, 20)] +products = [f"Product_{j}" for j in range(1, 20)] + +data = { + "customer": random.choices(customers, k=100), + "product": random.choices(products, k=100), +} + +# Create DataFrame +df = pd.DataFrame(data) + +# Create hyperedges +customer_products = df.groupby('customer')['product'].apply(list).values + +# Convert to Cleora input format +cleora_input = map(lambda x: ' '.join(x), customer_products) + +# Create Markov transition matrix for the hypergraph +mat = pycleora.SparseMatrix.from_iterator(cleora_input, columns='complex::reflexive::product') + +# Look at entity ids in the matrix, corresponding to embedding vectors +print(mat.entity_ids) +# ['Product_5', 'Product_3', 'Product_2', 'Product_4', 'Product_1'] + +# Initialize embedding vectors externally, using text, image, random vectors +# embeddings = ... + +# Or use built-in random deterministic initialization +embeddings = mat.initialize_deterministically(1024) + +# Perform Markov random walk, then normalize however many times we want + +NUM_WALKS = 3 # The optimal number depends on the graph, typically between 3 and 7 yields good results + # lower values tend to capture co-occurrence, higher iterations capture substitutability in a context + +for i in range(NUM_WALKS): + # Can propagate with a symmetric matrix as well, but left Markov is a great default + embeddings = mat.left_markov_propagate(embeddings) + # Normalize with L2 norm by default, for the embeddings to reside on a hypersphere. Can use standardization instead. 
+ embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) + +# We're done, here are our embeddings + +for entity, embedding in zip(mat.entity_ids, embeddings): + print(entity, embedding) + +# We can now compare our embeddings with dot product (since they are L2 normalized) + +print(np.dot(embeddings[0], embeddings[1])) +print(np.dot(embeddings[0], embeddings[2])) +print(np.dot(embeddings[0], embeddings[3])) +``` + **Read the whitepaper ["Cleora: A Simple, Strong and Scalable Graph Embedding Scheme"](https://arxiv.org/abs/2102.02302)** Cleora embeds entities in *n-dimensional spherical spaces* utilizing extremely fast stable, iterative random projections, which allows for unparalleled performance and scalability. @@ -166,14 +241,6 @@ The technical properties described above imply good production-readiness of Cleo More information can be found in [the full documentation](https://cleora.readthedocs.io/). -## Cleora Enterprise -**Cleora Enterprise** is now available for selected customers. 
Key improvements in addition to this open-source version: -* _performance optimizations_: 10x faster embedding times -* _latest research_: significantly improved embedding quality -* _new feature_: item attributes support -* _new feature_: multimodal fusion of multiple graphs, text and image embeddings -* _new feature_: compressed embeddings in various formats (spherical, hyperbolic, sparse) - For details contact us at cleora@synerise.com ## Cite diff --git a/examples/cleora_loop.py b/examples/cleora_loop.py index 7ecdebb..9e51917 100644 --- a/examples/cleora_loop.py +++ b/examples/cleora_loop.py @@ -1,7 +1,7 @@ import time import numpy as np -from cleora_python import SparseMatrix +from pycleora import SparseMatrix start_time = time.time() diff --git a/examples/column_indices.py b/examples/column_indices.py index 540e86e..2ed2fb7 100644 --- a/examples/column_indices.py +++ b/examples/column_indices.py @@ -1,5 +1,5 @@ import numpy as np -from cleora_python import SparseMatrix +from pycleora import SparseMatrix hyperedges = [ 'a\t1', diff --git a/examples/from_iterator.py b/examples/from_iterator.py index 2525f4d..2c008ea 100644 --- a/examples/from_iterator.py +++ b/examples/from_iterator.py @@ -1,7 +1,7 @@ import time import numpy as np -from cleora_python import SparseMatrix +from pycleora import SparseMatrix start_time = time.time() diff --git a/examples/graph_pickle.py b/examples/graph_pickle.py index e0c9cf3..7805b28 100644 --- a/examples/graph_pickle.py +++ b/examples/graph_pickle.py @@ -1,7 +1,7 @@ import time import numpy as np -from cleora_python import SparseMatrix +from pycleora import SparseMatrix import pickle diff --git a/examples/predefined_cleora_loop.py b/examples/predefined_cleora_loop.py index 9571bf2..5732714 100644 --- a/examples/predefined_cleora_loop.py +++ b/examples/predefined_cleora_loop.py @@ -1,6 +1,6 @@ import time -from cleora_python import embed_using_baseline_cleora, SparseMatrix +from pycleora import embed_using_baseline_cleora, 
SparseMatrix start_time = time.time() graph = SparseMatrix.from_files(["perf_inputs/0.tsv", "perf_inputs/1.tsv", "perf_inputs/2.tsv", "perf_inputs/3.tsv", "perf_inputs/4.tsv", "perf_inputs/5.tsv", "perf_inputs/6.tsv", "perf_inputs/7.tsv"], "complex::reflexive::name") diff --git a/cleora_python/.gitignore b/pycleora/.gitignore similarity index 100% rename from cleora_python/.gitignore rename to pycleora/.gitignore diff --git a/cleora_python/__init__.py b/pycleora/__init__.py similarity index 90% rename from cleora_python/__init__.py rename to pycleora/__init__.py index ecf4112..1c6ad89 100644 --- a/cleora_python/__init__.py +++ b/pycleora/__init__.py @@ -1,6 +1,6 @@ import numpy as np -from .cleora import SparseMatrix +from .pycleora import SparseMatrix def embed_using_baseline_cleora(graph, feature_dim: int, iter: int): embeddings = graph.initialize_deterministically(feature_dim) diff --git a/pycleora/cleora_python.cpython-39-x86_64-linux-gnu.so b/pycleora/cleora_python.cpython-39-x86_64-linux-gnu.so new file mode 100755 index 0000000..833ba25 Binary files /dev/null and b/pycleora/cleora_python.cpython-39-x86_64-linux-gnu.so differ diff --git a/pycleora/pycleora.cpython-39-x86_64-linux-gnu.so b/pycleora/pycleora.cpython-39-x86_64-linux-gnu.so new file mode 100755 index 0000000..ef08a03 Binary files /dev/null and b/pycleora/pycleora.cpython-39-x86_64-linux-gnu.so differ diff --git a/pyproject.toml b/pyproject.toml index a2caae5..b395b0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,13 +3,20 @@ requires = ["maturin>=1.2.3"] build-backend = "maturin" [project] -name = "cleora_python" +name = "pycleora" requires-python = ">=3.7" classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] +version = "2.0.0" +description = "Sparse hypergraph structure and markov-propagation for node embeddings exposed via Python bindings." 
+readme = { file = "README.md", content-type = "text/markdown" } +authors = [ + { name = "Jacek Dabrowski", email = "jack.dabrowski@synerise.com" } +] +license = { file = "LICENSE" } [tool.maturin] diff --git a/src/lib.rs b/src/lib.rs index 8134841..f2508ab 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -234,7 +234,7 @@ fn init_value(col: usize, hsh: u64, fixed_random_value: i64) -> f32 { } #[pymodule] -#[pyo3(name = "cleora")] +#[pyo3(name = "pycleora")] fn pycleora(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; Ok(()) diff --git a/src/sparse_matrix_builder.rs b/src/sparse_matrix_builder.rs index bbf7748..049d478 100644 --- a/src/sparse_matrix_builder.rs +++ b/src/sparse_matrix_builder.rs @@ -263,7 +263,7 @@ impl SparseMatrixBuffer { fn update_row(&mut self, hash: u64, count: u32) { let val = 1f32 / (count as f32); - let mut e = self.hash_2_row.entry(hash).or_default(); + let e = self.hash_2_row.entry(hash).or_default(); e.occurrence += count; e.row_sum += val }