diff --git a/Cargo.lock b/Cargo.lock index d0cbde3..8e999d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -806,7 +806,7 @@ dependencies = [ [[package]] name = "pycleora" -version = "3.2.0" +version = "3.2.1" dependencies = [ "bincode", "criterion", diff --git a/Cargo.toml b/Cargo.toml index b37b334..da7b6ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pycleora" -version = "3.2.0" +version = "3.2.1" edition = "2018" license-file = "LICENSE" readme = "README.md" diff --git a/README.md b/README.md index c17bf17..24244c4 100644 --- a/README.md +++ b/README.md @@ -78,12 +78,14 @@ for r in similar: print(f"{r['entity_id']}: {r['similarity']:.4f}") ``` +`embed()` defaults to `feature_dim=256`, `num_iterations=40`, and whitening after every propagation step. + ### Step-by-Step Example The high-level `embed()` function wraps the Markov propagation loop for convenience. Here's the full manual version, which gives you complete control over the process: ```python -from pycleora import SparseMatrix +from pycleora import SparseMatrix, whiten_embeddings import numpy as np import pandas as pd import random @@ -111,6 +113,7 @@ NUM_ITERATIONS = 40 for i in range(NUM_ITERATIONS): embeddings = mat.left_markov_propagate(embeddings) embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) + embeddings = whiten_embeddings(embeddings) for entity, embedding in zip(mat.entity_ids, embeddings): print(entity, embedding) @@ -155,7 +158,7 @@ Embeddings are stable across runs and support inductive learning: new nodes can | Algorithm | Type | Description | |-----------|------|-------------| -| **Cleora** | Spectral / Random Walk | Iterative Markov propagation with L2 normalization — all random walks in one matrix multiplication | +| **Cleora** | Spectral / Random Walk | Iterative Markov propagation with per-iteration whitening — all random walks in one matrix multiplication | | **ProNE** | Spectral | Fast spectral propagation with Chebyshev polynomial 
approximation | | **RandNE** | Random Projection | Gaussian random projection for very fast, approximate embeddings | | **NetMF** | Matrix Factorization | Network Matrix Factorization — factorizes the DeepWalk matrix explicitly | @@ -177,13 +180,13 @@ pycleora embed --input graph.tsv --output out.npz --algorithm node2vec Beyond the standard algorithms, Cleora supports several advanced embedding strategies: -- **Multiscale embeddings** — concatenates embeddings from different iteration depths (e.g. scales `[1, 2, 4, 8]`) to capture both local and global graph structure simultaneously +- **Multiscale embeddings** — concatenates embeddings from different iteration depths (e.g. scales `[10, 20, 30, 40]`) to capture both local and global graph structure simultaneously - **Attention-weighted propagation** — uses softmax-normalized dot-product attention during propagation, dynamically weighting neighbor contributions - **Supervised refinement** — fine-tunes unsupervised embeddings using positive/negative entity pairs with a triplet margin loss - **Directed graph embeddings** — handles asymmetric relationships where edge direction matters - **Weighted graph embeddings** — incorporates edge weights into the propagation step - **Node feature integration** — initializes embeddings with external features (text, image, numeric) before propagation -- **PCA whitening** — built-in ZCA whitening to decorrelate embedding dimensions and improve downstream task performance +- **PCA whitening** — built-in whitening after every iteration by default to decorrelate embedding dimensions and improve downstream task performance --- @@ -312,7 +315,7 @@ See [cleora.ai/use-cases](https://cleora.ai/use-cases) for detailed walkthroughs 2. **Hypergraph Construction** — Builds a heterogeneous hypergraph where a single edge can connect multiple entities of different types. 3. **Sparse Markov Matrix** — Constructs a sparse transition matrix (99%+ sparse). Rows normalized so each row sums to 1. 4. 
**Single Matrix Multiplication = All Walks** — One sparse matrix multiplication captures *every possible random walk* of a given length. No sampling, no noise. -5. **L2-Normalized Propagation** — Each iteration replaces every node's embedding with the L2-normalized average of its neighbors. 3-4 iterations for co-occurrence similarity, 7+ for contextual similarity. +5. **L2-Normalized + Whitened Propagation** — Each iteration replaces every node's embedding with the L2-normalized average of its neighbors and then whitens the embedding space. The default configuration runs 40 iterations at 256 dimensions. 6. **Embeddings Ready** — Dense, deterministic embedding vectors for every entity. Same input always yields same output. --- @@ -343,15 +346,15 @@ A: No, this is a methodologically wrong approach, stemming from outdated matrix **Q: What embedding dimensionality to use?** -A: The more, the better, but we typically work from _1024_ to _4096_. Memory is cheap and machines are powerful, so don't skimp on embedding size. +A: The default is **256**. For larger production systems we often work from _1024_ to _4096_, but `256` is the baseline shipped by the library. **Q: How many iterations of Markov propagation should I use?** -A: Depends on what you want to achieve. Low iterations (3) tend to approximate the co-occurrence matrix, while high iterations (7+) tend to give contextual similarity (think skip-gram but much more accurate and faster). +A: The default is **40** whitening-enhanced propagation steps. If you want more local, co-occurrence-style behavior you can dial that down manually; higher values bias more toward contextual similarity. **Q: How do I incorporate external information, e.g. entity metadata, images, texts into the embeddings?** -A: Just initialize the embedding matrix with your own vectors coming from a VIT, sentence-transformers, or a random projection of your numeric features. 
In that scenario low numbers of Markov iterations (1 to 3) tend to work best. +A: Just initialize the embedding matrix with your own vectors coming from a ViT, sentence-transformers, or a random projection of your numeric features. In that scenario fewer Markov iterations than the default `40` often work best. **Q: My embeddings don't fit in memory, what do I do?** @@ -367,7 +370,7 @@ A: Cleora works best for relatively sparse hypergraphs. If all your hyperedges c **Q: How can Cleora be so fast and accurate at the same time?** -A: Not using negative sampling is a great boon. By constructing the (sparse) Markov transition matrix, Cleora explicitly performs all possible random walks in a hypergraph in one big step (a single matrix multiplication). That's what we call a single _iteration_. We perform 3+ such iterations. Thanks to a highly efficient implementation in Rust, with special care for concurrency, memory layout and cache coherence, it is blazingly fast. Negative sampling or randomly selecting random walks tend to introduce a lot of noise - Cleora is free of those burdens. +A: Not using negative sampling is a great boon. By constructing the (sparse) Markov transition matrix, Cleora explicitly performs all possible random walks in a hypergraph in one big step (a single matrix multiplication). That's what we call a single _iteration_. The default configuration performs 40 such iterations with whitening after every step. Negative sampling or randomly selecting random walks tend to introduce a lot of noise - Cleora is free of those burdens. 
--- diff --git a/examples/cleora_loop.py b/examples/cleora_loop.py index 9e51917..8abb208 100644 --- a/examples/cleora_loop.py +++ b/examples/cleora_loop.py @@ -1,7 +1,7 @@ import time import numpy as np -from pycleora import SparseMatrix +from pycleora import SparseMatrix, whiten_embeddings start_time = time.time() @@ -9,14 +9,15 @@ graph = SparseMatrix.from_files(["perf_inputs/0.tsv", "perf_inputs/1.tsv", "perf_inputs/2.tsv", "perf_inputs/3.tsv", "perf_inputs/4.tsv", "perf_inputs/5.tsv", "perf_inputs/6.tsv", "perf_inputs/7.tsv"], "complex::reflexive::name") print("Entities n", len(graph.entity_ids)) -# embeddings = np.random.randn(len(graph.entity_ids), 128).astype(np.float32) -embeddings = graph.initialize_deterministically(feature_dim=128, seed=0) +# embeddings = np.random.randn(len(graph.entity_ids), 256).astype(np.float32) +embeddings = graph.initialize_deterministically(feature_dim=256, seed=0) -for i in range(3): +for i in range(40): embeddings = graph.left_markov_propagate(embeddings) # embeddings = graph.symmetric_markov_propagate(embeddings) embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) + embeddings = whiten_embeddings(embeddings) print(f"Iter {i} finished") print(graph.entity_ids[:10]) diff --git a/examples/from_iterator.py b/examples/from_iterator.py index 2c008ea..0a3393d 100644 --- a/examples/from_iterator.py +++ b/examples/from_iterator.py @@ -1,7 +1,7 @@ import time import numpy as np -from pycleora import SparseMatrix +from pycleora import SparseMatrix, whiten_embeddings start_time = time.time() @@ -25,9 +25,10 @@ def edges_iterator(): embeddings = np.random.randn(len(graph.entity_ids), 256).astype(np.float32) -for i in range(3): +for i in range(40): embeddings = graph.left_markov_propagate(embeddings) embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) + embeddings = whiten_embeddings(embeddings) print(f"Iter {i} finished") -print(f"Took {time.time() - start_time} seconds ") \ No newline at end of 
file +print(f"Took {time.time() - start_time} seconds ") diff --git a/examples/graph_pickle.py b/examples/graph_pickle.py index 7805b28..963c4f6 100644 --- a/examples/graph_pickle.py +++ b/examples/graph_pickle.py @@ -1,7 +1,7 @@ import time import numpy as np -from pycleora import SparseMatrix +from pycleora import SparseMatrix, whiten_embeddings import pickle @@ -21,7 +21,9 @@ print(graph.entity_ids[:10]) print(graph_reread.entity_ids[:10]) -embeddings = graph_reread.initialize_deterministically(feature_dim=128, seed=0) +embeddings = graph_reread.initialize_deterministically(feature_dim=256, seed=0) embeddings = graph_reread.left_markov_propagate(embeddings) +embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) +embeddings = whiten_embeddings(embeddings) -print(embeddings) \ No newline at end of file +print(embeddings) diff --git a/pycleora/__init__.py b/pycleora/__init__.py index 84d6dc7..866a6df 100644 --- a/pycleora/__init__.py +++ b/pycleora/__init__.py @@ -9,12 +9,15 @@ from . import search from . 
import compress +DEFAULT_FEATURE_DIM = 256 +DEFAULT_NUM_ITERATIONS = 40 + def embed_using_baseline_cleora(graph, feature_dim: int, iter: int): embeddings = graph.initialize_deterministically(feature_dim) for i in range(iter): embeddings = graph.left_markov_propagate(embeddings) - embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) + embeddings = _postprocess_iteration(embeddings, "l2", True) return embeddings @@ -42,18 +45,13 @@ def _to_scipy_sparse(graph: SparseMatrix, markov_type: str = "left"): def _auto_iterations(feature_dim: int) -> int: - if feature_dim <= 256: - return 4 - elif feature_dim <= 512: - return 8 - else: - return 16 + return DEFAULT_NUM_ITERATIONS def embed( graph: SparseMatrix, - feature_dim: int = 128, - num_iterations: Union[int, str] = 4, + feature_dim: int = DEFAULT_FEATURE_DIM, + num_iterations: Union[int, str] = DEFAULT_NUM_ITERATIONS, propagation: str = "left", normalization: str = "l2", seed: int = 0, @@ -73,6 +71,7 @@ def embed( initial_embeddings is None and callback is None and normalization == "l2" + and not whiten ) if use_fast_path: @@ -108,19 +107,22 @@ def embed( embeddings = graph.initialize_deterministically(feature_dim, seed) for i in range(num_iterations): + prev_embeddings = embeddings prev = embeddings if residual_weight > 0 else None embeddings = propagate_fn(embeddings, num_workers=num_workers) if residual_weight > 0 and prev is not None: embeddings = (1 - residual_weight) * embeddings + residual_weight * prev - embeddings = _normalize(embeddings, normalization) + embeddings = _postprocess_iteration(embeddings, normalization, whiten) if callback is not None: callback(i, embeddings) - if whiten: - embeddings = whiten_embeddings(embeddings) + if convergence_threshold > 0 and i > 0: + rmse = _compute_rmse(embeddings, prev_embeddings) + if rmse < convergence_threshold: + break return embeddings @@ -165,7 +167,7 @@ def whiten_embeddings(embeddings: np.ndarray, n_components: Optional[int] = None def 
embed_with_node_features( graph: SparseMatrix, node_features: Dict[str, np.ndarray], - num_iterations: int = 4, + num_iterations: int = DEFAULT_NUM_ITERATIONS, propagation: str = "left", normalization: str = "l2", feature_weight: float = 0.5, @@ -203,14 +205,15 @@ def embed_with_node_features( def embed_with_attention( graph: SparseMatrix, - feature_dim: int = 128, - num_iterations: int = 4, + feature_dim: int = DEFAULT_FEATURE_DIM, + num_iterations: int = DEFAULT_NUM_ITERATIONS, propagation: str = "left", normalization: str = "l2", attention_temperature: float = 1.0, seed: int = 0, num_workers: Optional[int] = None, callback: Optional[Callable[[int, np.ndarray], None]] = None, + whiten: bool = True, ) -> np.ndarray: _validate_propagation(propagation) @@ -224,7 +227,7 @@ def embed_with_attention( propagate_fn = _get_propagate_fn(graph, propagation) embeddings = propagate_fn(embeddings, num_workers=num_workers) - embeddings = _normalize(embeddings, normalization) + embeddings = _postprocess_iteration(embeddings, normalization, whiten) if callback is not None: callback(0, embeddings) @@ -265,7 +268,7 @@ def embed_with_attention( weighted_adj = diags(1.0 / row_sums_w) @ weighted_adj embeddings = (weighted_adj @ embeddings).astype(np.float32) - embeddings = _normalize(embeddings, normalization) + embeddings = _postprocess_iteration(embeddings, normalization, whiten) if callback is not None: callback(i, embeddings) @@ -275,7 +278,7 @@ def embed_with_attention( def embed_multiscale( graph: SparseMatrix, - feature_dim: int = 128, + feature_dim: int = DEFAULT_FEATURE_DIM, scales: List[int] = None, propagation: str = "left", normalization: str = "l2", @@ -286,7 +289,7 @@ def embed_multiscale( propagate_fn = _get_propagate_fn(graph, propagation) if scales is None: - scales = [1, 2, 4, 8] + scales = [10, 20, 30, 40] if not scales or not all(isinstance(s, int) and s > 0 for s in scales): raise ValueError("scales must be a non-empty list of positive integers") @@ -298,11 +301,9 
@@ def embed_multiscale( for scale in sorted(scales): while current_iter < scale: embeddings = propagate_fn(embeddings, num_workers=num_workers) - embeddings = _normalize(embeddings, normalization) + embeddings = _postprocess_iteration(embeddings, normalization, whiten) current_iter += 1 snapshot = embeddings.copy() - if whiten: - snapshot = whiten_embeddings(snapshot) all_embeddings.append(snapshot) return np.concatenate(all_embeddings, axis=1) @@ -311,13 +312,14 @@ def embed_multiscale( def embed_weighted( edges_with_weights: List[Tuple[str, float]], columns: str, - feature_dim: int = 128, - num_iterations: int = 4, + feature_dim: int = DEFAULT_FEATURE_DIM, + num_iterations: int = DEFAULT_NUM_ITERATIONS, propagation: str = "left", normalization: str = "l2", seed: int = 0, hyperedge_trim_n: int = 16, num_workers: Optional[int] = None, + whiten: bool = True, ) -> Tuple[SparseMatrix, np.ndarray]: from scipy.sparse import csr_matrix, diags @@ -352,7 +354,7 @@ def embed_weighted( for i in range(num_iterations): embeddings = (weighted_adj @ embeddings).astype(np.float32) - embeddings = _normalize(embeddings, normalization) + embeddings = _postprocess_iteration(embeddings, normalization, whiten) return graph, embeddings @@ -360,12 +362,13 @@ def embed_weighted( def embed_directed( edges: List[str], columns: str, - feature_dim: int = 128, - num_iterations: int = 4, + feature_dim: int = DEFAULT_FEATURE_DIM, + num_iterations: int = DEFAULT_NUM_ITERATIONS, normalization: str = "l2", seed: int = 0, hyperedge_trim_n: int = 16, num_workers: Optional[int] = None, + whiten: bool = True, ) -> Tuple[SparseMatrix, np.ndarray]: from scipy.sparse import csr_matrix, diags @@ -402,7 +405,7 @@ def embed_directed( embeddings = graph.initialize_deterministically(feature_dim, seed) for i in range(num_iterations): embeddings = (adj @ embeddings).astype(np.float32) - embeddings = _normalize(embeddings, normalization) + embeddings = _postprocess_iteration(embeddings, normalization, whiten) 
return graph, embeddings @@ -540,7 +543,7 @@ def embed_inductive( existing_edges: List[str], new_edges: List[str], columns: str, - num_iterations: int = 4, + num_iterations: int = DEFAULT_NUM_ITERATIONS, propagation: str = "left", normalization: str = "l2", hyperedge_trim_n: int = 16, @@ -580,8 +583,8 @@ def embed_inductive( def embed_streaming( edge_batches, columns: str, - feature_dim: int = 128, - num_iterations: int = 4, + feature_dim: int = DEFAULT_FEATURE_DIM, + num_iterations: int = DEFAULT_NUM_ITERATIONS, propagation: str = "left", normalization: str = "l2", hyperedge_trim_n: int = 16, @@ -681,11 +684,12 @@ def predict_links( def propagate_gpu( graph: SparseMatrix, embeddings: np.ndarray, - num_iterations: int = 4, + num_iterations: int = DEFAULT_NUM_ITERATIONS, propagation: str = "left", normalization: str = "l2", device: str = "cuda", callback: Optional[Callable[[int, np.ndarray], None]] = None, + whiten: bool = True, ) -> np.ndarray: _validate_propagation(propagation) @@ -726,6 +730,9 @@ def propagate_gpu( norms = torch.norm(emb, p=1, dim=1, keepdim=True).clamp(min=1e-10) emb = emb / norms + if whiten: + emb = _whiten_embeddings_torch(emb) + if callback is not None: callback(i, emb.cpu().numpy()) @@ -777,12 +784,13 @@ def find_most_similar( def embed_edge_features( graph: SparseMatrix, edge_features: Dict[str, np.ndarray], - feature_dim: int = 128, - num_iterations: int = 4, + feature_dim: int = DEFAULT_FEATURE_DIM, + num_iterations: int = DEFAULT_NUM_ITERATIONS, propagation: str = "left", normalization: str = "l2", combine: str = "concat", num_workers: Optional[int] = None, + whiten: bool = True, ) -> np.ndarray: from scipy.sparse import csr_matrix, diags @@ -791,6 +799,7 @@ def embed_edge_features( struct_emb = embed( graph, feature_dim=feature_dim, num_iterations=num_iterations, propagation=propagation, normalization=normalization, num_workers=num_workers, + whiten=whiten, ) if not edge_features: @@ -828,9 +837,7 @@ def embed_edge_features( H = 
node_feats for _ in range(num_iterations): H = (adj @ H) - feat_norms = np.linalg.norm(H, axis=1, keepdims=True) - feat_norms = np.maximum(feat_norms, 1e-10) - H = H / feat_norms + H = _postprocess_iteration(H.astype(np.float32), "l2", whiten).astype(np.float64) edge_emb = H.astype(np.float32) @@ -848,14 +855,15 @@ def embed_edge_features( class CleoraEmbedder: def __init__( self, - feature_dim: int = 128, - num_iterations: int = 4, + feature_dim: int = DEFAULT_FEATURE_DIM, + num_iterations: int = DEFAULT_NUM_ITERATIONS, propagation: str = "left", normalization: str = "l2", columns: str = "complex::reflexive::node", seed: int = 0, hyperedge_trim_n: int = 16, num_workers: Optional[int] = None, + whiten: bool = True, ): self.feature_dim = feature_dim self.num_iterations = num_iterations @@ -865,6 +873,7 @@ def __init__( self.seed = seed self.hyperedge_trim_n = hyperedge_trim_n self.num_workers = num_workers + self.whiten = whiten self.graph_ = None self.embeddings_ = None self.entity_ids_ = None @@ -881,6 +890,7 @@ def fit(self, edges: List[str], y=None): normalization=self.normalization, seed=self.seed, num_workers=self.num_workers, + whiten=self.whiten, ) self.entity_ids_ = list(self.graph_.entity_ids) return self @@ -917,6 +927,7 @@ def get_params(self, deep=True) -> Dict: "seed": self.seed, "hyperedge_trim_n": self.hyperedge_trim_n, "num_workers": self.num_workers, + "whiten": self.whiten, } def set_params(self, **params): @@ -947,3 +958,40 @@ def _normalize(embeddings: np.ndarray, method: str) -> np.ndarray: return embeddings else: raise ValueError(f"Unknown normalization method: {method}. 
Use 'l2', 'l1', 'spectral', or 'none'.") + + +def _postprocess_iteration( + embeddings: np.ndarray, + normalization: str, + whiten: bool, +) -> np.ndarray: + embeddings = _normalize(embeddings, normalization) + if whiten: + embeddings = whiten_embeddings(embeddings) + return embeddings + + +def _compute_rmse(current: np.ndarray, previous: np.ndarray) -> float: + diff = current.astype(np.float64, copy=False) - previous.astype(np.float64, copy=False) + return float(np.sqrt(np.mean(diff * diff))) + + +def _whiten_embeddings_torch(embeddings): + import torch + + n = embeddings.shape[0] + if n <= 1: + return embeddings.clone() + + mean = embeddings.mean(dim=0, keepdim=True) + centered = embeddings - mean + cov = centered.transpose(0, 1).matmul(centered) / max(n - 1, 1) + + eigenvalues, eigenvectors = torch.linalg.eigh(cov) + order = torch.argsort(eigenvalues, descending=True) + eigenvalues = eigenvalues[order] + eigenvectors = eigenvectors[:, order] + + scale = torch.rsqrt(torch.clamp(eigenvalues, min=1e-10)) + transform = eigenvectors * scale.unsqueeze(0) + return centered.matmul(transform) diff --git a/pycleora/algorithms.py b/pycleora/algorithms.py index 251e607..c99bfea 100644 --- a/pycleora/algorithms.py +++ b/pycleora/algorithms.py @@ -22,7 +22,7 @@ def _graph_to_adjacency(graph): def embed_prone( graph, - feature_dim: int = 128, + feature_dim: int = 256, mu: float = 0.2, theta: float = 0.5, seed: int = 0, @@ -66,8 +66,8 @@ def embed_prone( def embed_randne( graph, - feature_dim: int = 128, - num_iterations: int = 3, + feature_dim: int = 256, + num_iterations: int = 40, weights: Optional[List[float]] = None, seed: int = 0, ) -> np.ndarray: @@ -102,7 +102,7 @@ def embed_randne( def embed_hope( graph, - feature_dim: int = 128, + feature_dim: int = 256, beta: float = 0.1, ) -> np.ndarray: n = graph.num_entities @@ -151,7 +151,7 @@ def embed_hope( def embed_netmf( graph, - feature_dim: int = 128, + feature_dim: int = 256, window_size: int = 5, negative_samples: float 
= 1.0, ) -> np.ndarray: @@ -200,7 +200,7 @@ def embed_netmf( def embed_grarep( graph, - feature_dim: int = 128, + feature_dim: int = 256, max_step: int = 4, ) -> np.ndarray: n = graph.num_entities @@ -341,7 +341,7 @@ def _walks_to_embeddings(walks, n, feature_dim, window_size): def embed_deepwalk( graph, - feature_dim: int = 128, + feature_dim: int = 256, num_walks: int = 10, walk_length: int = 80, window_size: int = 5, @@ -355,7 +355,7 @@ def embed_deepwalk( def embed_node2vec( graph, - feature_dim: int = 128, + feature_dim: int = 256, num_walks: int = 10, walk_length: int = 80, window_size: int = 5, diff --git a/pycleora/benchmark.py b/pycleora/benchmark.py index fbfa68e..2c74a08 100644 --- a/pycleora/benchmark.py +++ b/pycleora/benchmark.py @@ -63,7 +63,7 @@ def benchmark_algorithms( def benchmark_datasets( dataset_names: List[str], embed_fn: Callable, - feature_dim: int = 128, + feature_dim: int = 256, seed: int = 42, ) -> Dict: from .datasets import load_dataset diff --git a/pycleora/cli.py b/pycleora/cli.py index 62abbec..d0af5a8 100644 --- a/pycleora/cli.py +++ b/pycleora/cli.py @@ -14,8 +14,8 @@ def main(): embed_parser = subparsers.add_parser("embed", help="Generate graph embeddings") embed_parser.add_argument("--input", "-i", required=True, help="Input edge file (TSV/CSV/space-separated)") embed_parser.add_argument("--output", "-o", required=True, help="Output file (npz/csv/tsv)") - embed_parser.add_argument("--dim", "-d", type=int, default=128, help="Embedding dimension (default: 128)") - embed_parser.add_argument("--iterations", "-n", type=int, default=4, help="Number of iterations (default: 4)") + embed_parser.add_argument("--dim", "-d", type=int, default=256, help="Embedding dimension (default: 256)") + embed_parser.add_argument("--iterations", "-n", type=int, default=40, help="Number of iterations (default: 40)") embed_parser.add_argument("--propagation", "-p", choices=["left", "symmetric"], default="left") 
embed_parser.add_argument("--normalization", choices=["l2", "l1", "none"], default="l2") embed_parser.add_argument("--columns", "-c", default="complex::reflexive::node", help="Column definition") @@ -30,14 +30,14 @@ def main(): bench_parser = subparsers.add_parser("benchmark", help="Run benchmarks") bench_parser.add_argument("--dataset", "-d", default="karate_club", help="Dataset name") - bench_parser.add_argument("--dim", type=int, default=128) + bench_parser.add_argument("--dim", type=int, default=256) similar_parser = subparsers.add_parser("similar", help="Find similar entities") similar_parser.add_argument("--input", "-i", required=True) similar_parser.add_argument("--columns", "-c", default="complex::reflexive::node") similar_parser.add_argument("--entity", "-e", required=True, help="Query entity") similar_parser.add_argument("--top-k", "-k", type=int, default=10) - similar_parser.add_argument("--dim", "-d", type=int, default=128) + similar_parser.add_argument("--dim", "-d", type=int, default=256) args = parser.parse_args() @@ -145,7 +145,7 @@ def _cmd_benchmark(args): graph = SparseMatrix.from_iterator(iter(ds["edges"]), ds["columns"]) algorithms = { - "cleora": lambda g: embed(g, args.dim, 4), + "cleora": lambda g: embed(g, args.dim, 40), "prone": lambda g: embed_prone(g, args.dim), "randne": lambda g: embed_randne(g, args.dim), "deepwalk": lambda g: embed_deepwalk(g, args.dim), diff --git a/pycleora/hetero.py b/pycleora/hetero.py index 48f877b..fb902ed 100644 --- a/pycleora/hetero.py +++ b/pycleora/hetero.py @@ -88,12 +88,13 @@ def to_homogeneous_edges(self) -> List[str]: def embed_per_relation( self, - feature_dim: int = 128, - num_iterations: int = 4, + feature_dim: int = 256, + num_iterations: int = 40, propagation: str = "left", normalization: str = "l2", combine: str = "concat", seed: int = 0, + whiten: bool = True, ) -> Tuple[Dict[str, SparseMatrix], Dict[str, np.ndarray], Optional[np.ndarray]]: from . 
import embed @@ -114,7 +115,8 @@ def embed_per_relation( graph = SparseMatrix.from_iterator(iter(edge_strs), columns) emb = embed(graph, feature_dim=feature_dim, num_iterations=num_iterations, - propagation=propagation, normalization=normalization, seed=seed) + propagation=propagation, normalization=normalization, seed=seed, + whiten=whiten) graphs[et_name] = graph embeddings[et_name] = emb @@ -173,10 +175,11 @@ def embed_per_relation( def embed_metapath( self, metapath: List[str], - feature_dim: int = 128, - num_iterations: int = 4, + feature_dim: int = 256, + num_iterations: int = 40, normalization: str = "l2", seed: int = 0, + whiten: bool = True, ) -> Tuple[SparseMatrix, np.ndarray]: from . import embed @@ -231,7 +234,7 @@ def compose_paths(adj_list): graph = SparseMatrix.from_iterator(iter(edge_strs), columns) emb = embed(graph, feature_dim=feature_dim, num_iterations=num_iterations, - normalization=normalization, seed=seed) + normalization=normalization, seed=seed, whiten=whiten) return graph, emb diff --git a/pyproject.toml b/pyproject.toml index ed37e12..cb6007c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -version = "3.2.0" +version = "3.2.1" description = "Fast CPU-only graph embedding library with Rust core. Supports Cleora, DeepWalk, Node2Vec, ProNE, MLP, and more." 
readme = { file = "README.md", content-type = "text/markdown" } authors = [ diff --git a/setup.py b/setup.py index f0f977c..af13250 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name="pycleora", - version="3.0.0", + version="3.2.1", description="Fast CPU-only graph embedding library with Rust core", long_description=open("README.md").read() if __import__("os").path.exists("README.md") else "", long_description_content_type="text/markdown", diff --git a/src/embedding.rs b/src/embedding.rs index 7d62c34..3831508 100644 --- a/src/embedding.rs +++ b/src/embedding.rs @@ -27,7 +27,13 @@ impl NdArrayMatrix { .unwrap(); pool.install(|| { - Self::spmm_kernel(sparse_matrix_reader, other, &markov_type, dim, new_matrix.view_mut()); + Self::spmm_kernel( + sparse_matrix_reader, + other, + &markov_type, + dim, + new_matrix.view_mut(), + ); }); new_matrix } diff --git a/src/lib.rs b/src/lib.rs index 5a2343c..134b351 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,6 @@ +use rustc_hash::FxHasher; use std::collections::HashMap; use std::hash::Hasher; -use rustc_hash::FxHasher; use bincode::{deserialize, serialize}; use ndarray::{Array1, Array2, ArrayViewMut2, Axis, Ix1, Ix2}; @@ -53,8 +53,7 @@ impl SparseMatrix { num_workers: Option, ) -> Result { let columns = configuration::parse_fields(columns)?; - let matrix_desc = create_sparse_matrix_descriptor(&columns) - .map_err(|e| e.to_string())?; + let matrix_desc = create_sparse_matrix_descriptor(&columns).map_err(|e| e.to_string())?; let workers = num_workers.unwrap_or_else(num_cpus::get).max(1); let config = Configuration { seed: None, @@ -147,15 +146,18 @@ impl SparseMatrix { return Err(PyValueError::new_err("At least one file path is required")); } for filepath in filepaths.iter() { - if !filepath.ends_with(".tsv") && !filepath.ends_with(".csv") && !filepath.ends_with(".txt") { - return Err(PyValueError::new_err( - format!("Unsupported file format: {}. 
Supported: .tsv, .csv, .txt", filepath) - )); + if !filepath.ends_with(".tsv") + && !filepath.ends_with(".csv") + && !filepath.ends_with(".txt") + { + return Err(PyValueError::new_err(format!( + "Unsupported file format: {}. Supported: .tsv, .csv, .txt", + filepath + ))); } } - let columns = configuration::parse_fields(columns) - .map_err(PyValueError::new_err)?; + let columns = configuration::parse_fields(columns).map_err(PyValueError::new_err)?; let matrix_desc = create_sparse_matrix_descriptor(&columns).map_err(PyValueError::new_err)?; @@ -329,9 +331,12 @@ impl SparseMatrix { let markov_type = match propagation { "left" => MarkovType::Left, "symmetric" => MarkovType::Symmetric, - _ => return Err(PyValueError::new_err(format!( - "Unknown propagation '{}'. Use 'left' or 'symmetric'.", propagation - ))), + _ => { + return Err(PyValueError::new_err(format!( + "Unknown propagation '{}'. Use 'left' or 'symmetric'.", + propagation + ))) + } }; let mut vectors = Array2::zeros([self.entity_ids.len(), feature_dim]); @@ -345,7 +350,13 @@ impl SparseMatrix { .build() .unwrap(); pool.install(|| { - NdArrayMatrix::embed_full(self, vectors, markov_type, num_iterations, residual_weight) + NdArrayMatrix::embed_full( + self, + vectors, + markov_type, + num_iterations, + residual_weight, + ) }) }); @@ -367,9 +378,12 @@ impl SparseMatrix { let markov_type = match propagation { "left" => MarkovType::Left, "symmetric" => MarkovType::Symmetric, - _ => return Err(PyValueError::new_err(format!( - "Unknown propagation '{}'. Use 'left' or 'symmetric'.", propagation - ))), + _ => { + return Err(PyValueError::new_err(format!( + "Unknown propagation '{}'. 
Use 'left' or 'symmetric'.", + propagation + ))) + } }; let mut vectors = Array2::zeros([self.entity_ids.len(), feature_dim]); @@ -384,8 +398,12 @@ impl SparseMatrix { .unwrap(); pool.install(|| { NdArrayMatrix::embed_full_with_convergence( - self, vectors, markov_type, max_iterations, - residual_weight, convergence_threshold, + self, + vectors, + markov_type, + max_iterations, + residual_weight, + convergence_threshold, ) }) }); diff --git a/src/pipeline.rs b/src/pipeline.rs index fb5a9ac..e3867c4 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -223,9 +223,15 @@ where fn parse_line(line: &str) -> Vec> { let trimmed = line.trim(); if trimmed.contains('\t') { - trimmed.split('\t').map(|c| c.split(' ').collect()).collect() + trimmed + .split('\t') + .map(|c| c.split(' ').collect()) + .collect() } else if trimmed.contains(',') { - trimmed.split(',').map(|c| c.trim().split(' ').collect()).collect() + trimmed + .split(',') + .map(|c| c.trim().split(' ').collect()) + .collect() } else { let mut result = Vec::with_capacity(1); result.push(trimmed.split(' ').collect()); diff --git a/tests/snapshot.rs b/tests/snapshot.rs index c152222..dc55ae1 100644 --- a/tests/snapshot.rs +++ b/tests/snapshot.rs @@ -8,8 +8,8 @@ mod tests { use ndarray_rand::rand_distr::Uniform; use ndarray_rand::RandomExt; - use cleora::embedding::{MarkovType, NdArrayMatrix}; - use cleora::sparse_matrix::SparseMatrix; + use pycleora::embedding::{MarkovType, NdArrayMatrix}; + use pycleora::sparse_matrix::SparseMatrix; fn round(arr: Array2) -> Array2 { arr.map(|v| (v * 1000.) 
as i32) diff --git a/tests/snapshots/snapshot__tests__markov_left_02.snap b/tests/snapshots/snapshot__tests__markov_left_02.snap index ed3e3b4..ab1d37e 100644 --- a/tests/snapshots/snapshot__tests__markov_left_02.snap +++ b/tests/snapshots/snapshot__tests__markov_left_02.snap @@ -1,11 +1,12 @@ --- source: tests/snapshot.rs +assertion_line: 31 expression: embedding_out --- [[5616, 5049, 4705, 4796, 5524, 4975, 4748, 4576, 4597, 5267, 5397, 5265, 4064, 5312, 5110, 4791, 4400, 5462, 5045, 4821, 4434, 5255, 5106, 4929, 5351, 4791, 5309, 4956, 4782, 4826, 5200, 4381], [6013, 4638, 4683, 5031, 5790, 4837, 4174, 4353, 5286, 5298, 5200, 5886, 4438, 5184, 5251, 5441, 4866, 5472, 4840, 4806, 3984, 4394, 5482, 3822, 5165, 4929, 5088, 5566, 4726, 4523, 5116, 4291], [5766, 4827, 4389, 4899, 5166, 4802, 4281, 5281, 4724, 5583, 4962, 5428, 3971, 4910, 5020, 4823, 4636, 5304, 5453, 4334, 4842, 5001, 5067, 4668, 5044, 5017, 5218, 5431, 4694, 4352, 4704, 4716], - [5543, 4674, 4300, 4837, 5177, 5186, 4680, 4371, 4797, 5347, 4756, 6032, 4214, 5094, 4964, 4494, 4004, 5390, 3761, 4587, 5102, 4595, 5235, 4517, 4836, 4944, 4896, 4980, 4510, 5311, 5008, 4847], + [5543, 4673, 4300, 4837, 5177, 5186, 4680, 4371, 4797, 5347, 4756, 6032, 4214, 5094, 4964, 4494, 4004, 5390, 3761, 4587, 5102, 4595, 5235, 4517, 4836, 4944, 4896, 4980, 4510, 5311, 5008, 4847], [4700, 5096, 4454, 4895, 4722, 5782, 4827, 4924, 4973, 5414, 5212, 5636, 4710, 5458, 5368, 5211, 4376, 4882, 4776, 4855, 4605, 4583, 5187, 5146, 5060, 4893, 5434, 4660, 4750, 4681, 4938, 3846], [5239, 4660, 3898, 4312, 5587, 4839, 4645, 4674, 4544, 6107, 5209, 5942, 4711, 5093, 4857, 5385, 4592, 4785, 5012, 4260, 4716, 5325, 5224, 4517, 5575, 4845, 5721, 5088, 5273, 4731, 5338, 4927], [5814, 5092, 5355, 4727, 5917, 5001, 4864, 4911, 4964, 5833, 4360, 5233, 5572, 4613, 5156, 4888, 4890, 5226, 4700, 4861, 4656, 4456, 5631, 4989, 4637, 5247, 5515, 4378, 4835, 5105, 4522, 4624], @@ -93,7 +94,7 @@ expression: embedding_out [4818, 4914, 5108, 
4555, 5681, 5262, 4533, 4805, 4851, 5529, 5048, 5818, 4909, 4927, 5638, 4428, 4620, 5653, 4611, 4909, 4088, 4811, 5401, 4044, 5458, 5150, 5271, 5510, 4504, 4291, 5066, 4468], [5599, 4977, 4480, 5115, 5375, 4745, 4856, 4723, 5208, 5951, 4767, 5210, 4378, 4865, 5176, 5422, 4762, 5845, 4916, 4696, 4292, 5329, 5504, 4132, 4767, 5512, 5243, 5262, 4626, 4875, 4856, 5257], [5746, 4836, 5216, 4808, 5673, 5184, 4391, 4711, 4828, 5748, 4960, 5071, 4110, 4607, 4900, 5457, 4788, 5582, 4861, 5173, 4595, 4590, 5137, 4707, 4858, 4497, 5608, 5406, 5093, 5319, 4731, 4569], - [5530, 5399, 4731, 5356, 5082, 5521, 4665, 4789, 4640, 5488, 4909, 5037, 4439, 4479, 4959, 5139, 3966, 5678, 5491, 4421, 4659, 5199, 4970, 4929, 5463, 4465, 4833, 5415, 4761, 5395, 4560, 4168], + [5530, 5399, 4731, 5356, 5082, 5521, 4665, 4789, 4640, 5488, 4909, 5037, 4439, 4479, 4959, 5139, 3966, 5678, 5491, 4421, 4659, 5199, 4970, 4929, 5463, 4465, 4833, 5415, 4760, 5395, 4560, 4168], [5330, 5228, 4402, 5192, 5325, 4297, 5103, 4813, 5167, 5572, 4704, 5313, 3584, 4610, 4328, 5521, 5049, 4763, 4480, 4396, 5314, 5206, 4947, 4495, 4390, 5333, 4940, 5113, 5291, 5649, 4897, 4902], [5787, 4729, 4406, 5152, 5015, 4742, 4341, 4978, 4242, 5338, 5413, 5226, 4321, 4558, 4732, 5108, 4260, 4949, 5273, 4811, 4684, 5512, 4627, 4898, 4858, 4981, 4938, 4950, 5438, 5592, 5768, 5006], [5225, 4982, 4658, 5056, 5644, 4828, 4958, 5095, 4305, 4986, 5075, 5138, 4246, 4696, 4881, 5173, 4880, 5374, 5314, 4653, 4810, 4458, 5333, 4844, 5222, 5252, 5553, 5603, 4818, 4947, 4168, 4550], diff --git a/tests/snapshots/snapshot__tests__markov_sym_01.snap b/tests/snapshots/snapshot__tests__markov_sym_01.snap index 5e283e3..be302eb 100644 --- a/tests/snapshots/snapshot__tests__markov_sym_01.snap +++ b/tests/snapshots/snapshot__tests__markov_sym_01.snap @@ -1,5 +1,6 @@ --- source: tests/snapshot.rs +assertion_line: 40 expression: embedding_out --- [[5205, 4494, 5473, 6162, 5249, 5177, 4084, 5417, 1853, 5858, 5598, 2754, 3044, 2174, 3585, 6728, 
7189, 5900, 4162, 3569, 2764, 2871, 6165, 5764, 2021, 2740, 3175, 5067, 4259, 5444, 4661, 3731], @@ -65,7 +66,7 @@ expression: embedding_out [6254, 4519, 4180, 5580, 6296, 5717, 5139, 2770, 2834, 4456, 4434, 3206, 5260, 2452, 6645, 6290, 6232, 4381, 5377, 6387, 2741, 6584, 4664, 4383, 5854, 3283, 6486, 3558, 6485, 2796, 5074, 3623], [6157, 5729, 7263, 4244, 3645, 6884, 6976, 5024, 2148, 6697, 4169, 7362, 4280, 5540, 3670, 2957, 4166, 6993, 5529, 7199, 3626, 6917, 7715, 2677, 7257, 6071, 4808, 3392, 5636, 5849, 5771, 4771], [3135, 4145, 2864, 1954, 2454, 4925, 2646, 6500, 2254, 5008, 2303, 5349, 5041, 3412, 2447, 4561, 6948, 3259, 6163, 5933, 6618, 6776, 2554, 5964, 4564, 6620, 2555, 6022, 3158, 6211, 5091, 3854], - [3199, 6867, 2625, 3601, 3597, 4384, 6193, 7195, 6304, 3790, 8070, 7418, 3524, 5509, 4408, 3982, 6865, 7205, 6507, 4110, 4049, 6413, 2927, 5525, 6187, 7511, 3512, 5652, 3305, 6312, 3549, 4508], + [3199, 6867, 2625, 3601, 3597, 4384, 6192, 7195, 6304, 3790, 8070, 7418, 3524, 5509, 4408, 3982, 6865, 7205, 6507, 4110, 4049, 6413, 2927, 5525, 6187, 7511, 3512, 5652, 3305, 6312, 3549, 4508], [6890, 4346, 6889, 3716, 4623, 4315, 5783, 3492, 5375, 4052, 6376, 5404, 4651, 4351, 4085, 4343, 5950, 6339, 5030, 3524, 6924, 2806, 3057, 5269, 5930, 5723, 4492, 5915, 5734, 4950, 6431, 6626], [7088, 6874, 2774, 5333, 3429, 6094, 4786, 3575, 4221, 5874, 5717, 3007, 4939, 2155, 4336, 5021, 6931, 7053, 6107, 7565, 3055, 6494, 2431, 4360, 5919, 2756, 3112, 3476, 5985, 5455, 3863, 7261], [6762, 7299, 3981, 3790, 7910, 4474, 7363, 6109, 7458, 3057, 5710, 3627, 3507, 6103, 2696, 3197, 6481, 3676, 5137, 3885, 4163, 2707, 3042, 3371, 3789, 5272, 3172, 5981, 4865, 3213, 4459, 2899], diff --git a/tests/snapshots/snapshot__tests__markov_sym_02.snap b/tests/snapshots/snapshot__tests__markov_sym_02.snap index 993a5fa..80418f7 100644 --- a/tests/snapshots/snapshot__tests__markov_sym_02.snap +++ b/tests/snapshots/snapshot__tests__markov_sym_02.snap @@ -1,5 +1,6 @@ --- source: 
tests/snapshot.rs +assertion_line: 49 expression: embedding_out --- [[5840, 5236, 4808, 4910, 5726, 5097, 4895, 4767, 4748, 5434, 5569, 5426, 4240, 5527, 5220, 4968, 4596, 5625, 5210, 5022, 4607, 5420, 5271, 5031, 5508, 4960, 5475, 5160, 4962, 4957, 5321, 4587], @@ -71,7 +72,7 @@ expression: embedding_out [5299, 4834, 4384, 4826, 5819, 5120, 5333, 5289, 5455, 4943, 5724, 5339, 4268, 4869, 5045, 5745, 5760, 5474, 5348, 5420, 4552, 4821, 4688, 4806, 5454, 4420, 5505, 5555, 4451, 5055, 5026, 4941], [5625, 5965, 5381, 4771, 6099, 5701, 5439, 4845, 4716, 5865, 5190, 6182, 4705, 5293, 5967, 5977, 4807, 5872, 5398, 5450, 5060, 5925, 6287, 4842, 5805, 5481, 5470, 5845, 5853, 4942, 5588, 5278], [5577, 5180, 4883, 5212, 5619, 4812, 4808, 4339, 4662, 5781, 4820, 6036, 4275, 4485, 5773, 4604, 4109, 5386, 4666, 4674, 4851, 4709, 5835, 5592, 5144, 5654, 5100, 4704, 5570, 5168, 5234, 4587], - [5389, 4321, 3726, 4633, 5217, 4517, 4646, 4850, 4318, 5785, 5031, 5005, 4531, 4615, 4527, 5025, 4589, 5025, 4441, 4310, 4223, 5258, 4917, 4772, 4987, 5280, 4583, 5237, 4900, 4442, 5272, 4579], + [5389, 4321, 3726, 4633, 5217, 4517, 4646, 4850, 4318, 5785, 5031, 5005, 4531, 4615, 4527, 5025, 4589, 5025, 4441, 4310, 4223, 5258, 4917, 4772, 4987, 5280, 4584, 5237, 4900, 4442, 5272, 4579], [5053, 4390, 4953, 4562, 4408, 4772, 4128, 4024, 3930, 4747, 5248, 5044, 4350, 4358, 4995, 4231, 3793, 4900, 4683, 4753, 4468, 4629, 4333, 4461, 4533, 4767, 4532, 4488, 4966, 4495, 4857, 4299], [5170, 5119, 4301, 4909, 5181, 5370, 4266, 4368, 4201, 4984, 5102, 5321, 4625, 5717, 4788, 4837, 4872, 5077, 5247, 4922, 5284, 4764, 5231, 4767, 5544, 5156, 5215, 5520, 5215, 5194, 5084, 4121], [5302, 5726, 4774, 4521, 6010, 5425, 4394, 5391, 4766, 5110, 4906, 5840, 4687, 5090, 5261, 5532, 4708, 5597, 5819, 4950, 4273, 4761, 5405, 5059, 4799, 4622, 5544, 5148, 5228, 5915, 4766, 5006],