"""
Synthetic Node Expansion Algorithm.

Iteratively copies high-connectivity anchor nodes into low-dimensional
synthetic nodes until no new high-dim edges are created.

- Anchor nodes carry high-dimensional feature vectors
- Synthetic copies live in low-dim space
- Each anchor group shares exactly ONE high-dim edge (mobile, exclusive)
- Duplication of that edge to two low-dim nodes is allowed when the
  Shannon entropy of the co-occurrence distribution exceeds threshold theta
- Generation order: descending anchor degree
- Termination: no new synthetic nodes produced in a round

GPU Acceleration:
    Uses RAPIDS (cugraph, cupy, cudf) if available, falls back to
    networkx, numpy, pandas transparently.
"""

# ---------------------------------------------------------------------
# Backend selection — GPU (RAPIDS) or CPU (NetworkX / NumPy)
# ---------------------------------------------------------------------
try:
    import cugraph as nx_backend
    import cupy as np_backend
    import cudf as pd_backend
    GPU = True
    print("[INFO] RAPIDS detected — running on GPU.")
except ImportError:
    import networkx as nx_backend
    import numpy as np_backend
    import pandas as pd_backend
    GPU = False
    print("[INFO] RAPIDS not found — falling back to CPU (NetworkX / NumPy).")

import numpy as np        # always available; used for host-side bookkeeping
import networkx as nx     # always available; used as fallback graph type
# —————————————————————————
# Graph helpers — unified API over cugraph / networkx
# —————————————————————————
def create_graph():
    """Instantiate an empty graph object for the active backend.

    Both cuGraph and NetworkX expose a ``Graph`` constructor, so only the
    module chosen at import time differs.
    """
    backend = nx_backend if GPU else nx
    return backend.Graph()
def add_node(G, node_id, **attrs):
    """Add ``node_id`` with attributes on the CPU backend.

    cuGraph graphs are edge-list based, so on the GPU path nothing is added
    here; node attributes are tracked separately by the caller.
    """
    if not GPU:
        G.add_node(node_id, **attrs)
def add_edge(G, u, v, **attrs):
    """Add an undirected edge u–v.

    Edge attributes are honored only on the CPU (NetworkX) backend; the
    cuGraph edge list stores bare endpoint pairs.
    """
    if not GPU:
        G.add_edge(u, v, **attrs)
    else:
        G.add_edge(u, v)
def get_degrees(G, nodes):
    """Return a dict {node: degree} for the given node list.

    Nodes not present in the graph are reported with degree 0.
    """
    if GPU:
        # cuGraph returns a cuDF DataFrame with 'vertex' / 'degree' columns;
        # move both columns to host memory before building the lookup dict.
        deg_df = G.degree()
        deg_map = dict(zip(deg_df['vertex'].to_pandas(),
                           deg_df['degree'].to_pandas()))
    else:
        deg_map = dict(G.degree())
    return {n: deg_map.get(n, 0) for n in nodes}
def get_neighbors(G, node):
    """Return the neighbors of ``node`` as a list.

    The original GPU/CPU branches were byte-identical — both backends
    expose the same ``neighbors`` iterator — so the dead branch is removed.
    """
    return list(G.neighbors(node))
# —————————————————————————
# Shannon entropy
# —————————————————————————
def shannon_entropy(counts):
    """Compute the Shannon entropy (in bits) of a count vector.

    H = -sum(p * log2(p)) over entries with p > 0.

    Parameters
    ----------
    counts : sequence of numbers
        Non-negative co-occurrence counts.

    Returns
    -------
    float
        Entropy in bits; 0.0 for an empty or all-zero vector.
    """
    arr = np_backend.array(counts, dtype=float)
    total = arr.sum()
    if total == 0:
        return 0.0
    p = arr / total
    # Mask zero probabilities to avoid log(0).
    mask = p > 0
    h = -np_backend.sum(p[mask] * np_backend.log2(p[mask]))
    # Return a plain Python float regardless of backend array type.
    return float(h)
# —————————————————————————
# Co-occurrence scoring
# —————————————————————————
def cooccurrence_affinity(edge, group_edges_list):
    """Count how often a high-dim edge co-occurs with each group's edges.

    Parameters
    ----------
    edge : hashable
        The high-dim edge being evaluated.
    group_edges_list : list of sets
        Each element is the set of edges associated with one anchor group.

    Returns
    -------
    counts : list of int
        Co-occurrence count (0 or 1 per group) of ``edge`` with each
        group's edge set.
    """
    # Membership test per group; int(...) maps True/False to 1/0.
    return [int(edge in group_edges) for group_edges in group_edges_list]
def best_group(counts):
    """Return the index of the group with the highest co-occurrence count.

    Ties resolve to the lowest index (``np.argmax`` semantics).
    """
    return int(np.argmax(counts))
# —————————————————————————
# Core data structures
# —————————————————————————
class AnchorGroup:
    """One anchor node together with all of its synthetic copies.

    Tracks the single mobile high-dim edge assigned to the group:
    exactly one member holds it at any time (``edge_holder``).
    """

    def __init__(self, anchor_id, high_dim_vector, high_dim_edge):
        self.anchor_id = anchor_id
        self.high_dim_vector = np_backend.array(high_dim_vector, dtype=float)
        self.high_dim_edge = high_dim_edge   # the one shared high-dim edge
        self.edge_holder = anchor_id         # which node currently holds it
        self.members = [anchor_id]           # anchor + all synthetic copies
        self.synthetic_count = 0

    def add_synthetic(self, node_id):
        """Register a newly generated synthetic copy as a group member."""
        self.members.append(node_id)
        self.synthetic_count += 1

    def reassign_edge(self, new_holder):
        """Move the high-dim edge to a different member of this group."""
        assert new_holder in self.members, "Holder must be a group member."
        self.edge_holder = new_holder
# —————————————————————————
# Similarity / edge rule (high-dim)
# —————————————————————————
def high_dim_similar(vec_a, vec_b, threshold=0.8):
    """Cosine-similarity edge rule in high-dim space.

    Returns True when cos(vec_a, vec_b) >= threshold.  A zero vector is
    never similar to anything (guards against division by zero).
    """
    a = np_backend.array(vec_a, dtype=float)
    b = np_backend.array(vec_b, dtype=float)
    norm_a = np_backend.linalg.norm(a)
    norm_b = np_backend.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return False
    sim = float(np_backend.dot(a, b) / (norm_a * norm_b))
    return sim >= threshold
# —————————————————————————
# Synthetic node generation
# —————————————————————————
def generate_synthetic_vector(high_dim_vector, low_dim=8, seed=None):
    """Generate a synthetic node with two representations.

    - high_dim : perturbed copy of the anchor vector (used for edge checking)
    - low_dim  : random projection (the node's actual low-dim embedding)

    In production, replace the projection with PCA / UMAP.

    Parameters
    ----------
    high_dim_vector : array-like
        Anchor feature vector.
    low_dim : int
        Dimensionality of the projected embedding.
    seed : int or None
        RNG seed; identical seeds reproduce identical outputs.

    Returns
    -------
    (high_dim, low_dim_vec) : tuple of np.ndarray
    """
    rng = np.random.default_rng(seed)
    vec = np.array(high_dim_vector)
    d = len(vec)
    # High-dim representation: small perturbation of the anchor vector.
    noise = rng.standard_normal(d) * 0.1
    high_dim = vec + noise
    # Low-dim representation: scaled random projection.
    projection_matrix = rng.standard_normal((low_dim, d)) / np.sqrt(low_dim)
    low_dim_vec = projection_matrix @ vec
    return high_dim, low_dim_vec
# —————————————————————————
# Main algorithm
# —————————————————————————
def run_expansion(
    anchor_data,
    similarity_threshold=0.8,
    entropy_threshold=0.9,
    low_dim=8,
    max_rounds=50,
    verbose=True
):
    """Synthetic Node Expansion Algorithm.

    Parameters
    ----------
    anchor_data : list of dicts, each with keys:
        'id'            : unique node identifier
        'vector'        : high-dim feature vector (list / array)
        'high_dim_edge' : identifier of the high-dim edge this anchor holds
        'edges'         : list of (neighbor_id, ...) graph edges for this node
    similarity_threshold : float
        Cosine similarity cutoff for the high-dim edge rule.
    entropy_threshold : float
        Shannon entropy cutoff (bits) above which edge duplication is allowed.
        Max possible for 2 groups = log2(2) = 1.0 bit.
    low_dim : int
        Dimensionality of synthetic node vectors.
    max_rounds : int
        Safety cap on iterations.
    verbose : bool
        Print per-round diagnostics.

    Returns
    -------
    G : graph
        Final expanded graph (anchors + synthetic nodes).
    groups : dict {anchor_id: AnchorGroup}
        All anchor groups with their current edge assignments.
    synthetic_nodes : dict {node_id: dict}
        Metadata for all generated synthetic nodes.
    """
    # --- Build initial graph and anchor groups ---
    G = create_graph()
    groups = {}
    synthetic_nodes = {}
    node_to_group = {}  # maps any node_id -> its AnchorGroup
    for item in anchor_data:
        aid = item['id']
        group = AnchorGroup(aid, item['vector'], item['high_dim_edge'])
        groups[aid] = group
        node_to_group[aid] = group
        add_node(G, aid, kind='anchor', vector=item['vector'])
        for (nbr, *_) in item.get('edges', []):
            add_edge(G, aid, nbr)

    anchor_ids = list(groups.keys())

    # --- Iterative expansion ---
    for round_num in range(1, max_rounds + 1):
        if verbose:
            print(f"\n=== Round {round_num} ===")

        # Rank anchors by current degree (descending).
        degrees = get_degrees(G, anchor_ids)
        ranked_anchors = sorted(anchor_ids, key=lambda a: degrees[a], reverse=True)
        new_nodes_this_round = []

        for anchor_id in ranked_anchors:
            group = groups[anchor_id]

            # Generate a synthetic copy — two representations:
            #   syn_high_dim : perturbed anchor vector (used for edge checking)
            #   syn_low_dim  : low-dim embedding (the node's actual position)
            anchor_vec = np.array(group.high_dim_vector.tolist() if GPU
                                  else group.high_dim_vector)
            syn_high_dim, syn_low_dim = generate_synthetic_vector(
                anchor_vec,
                low_dim=low_dim,
                # Deterministic seed per (round, anchor) when ids are ints.
                seed=round_num * 1000 + anchor_id if isinstance(anchor_id, int)
                else None
            )
            syn_id = f"syn_{anchor_id}_r{round_num}"

            # Check if this synthetic node creates a new high-dim edge
            # by testing similarity against all other anchor vectors.
            creates_new_edge = False
            for other_id, other_group in groups.items():
                if other_id == anchor_id:
                    continue
                other_vec = np.array(other_group.high_dim_vector.tolist()
                                     if GPU else other_group.high_dim_vector)
                if high_dim_similar(syn_high_dim, other_vec, similarity_threshold):
                    creates_new_edge = True
                    break

            if not creates_new_edge:
                if verbose:
                    print(f" [{anchor_id}] Synthetic copy {syn_id} creates "
                          f"no new high-dim edges — skipping.")
                continue

            # Add synthetic node to graph and group.
            add_node(G, syn_id, kind='synthetic', vector=syn_low_dim.tolist())
            add_edge(G, anchor_id, syn_id)
            group.add_synthetic(syn_id)
            node_to_group[syn_id] = group
            synthetic_nodes[syn_id] = {
                'anchor': anchor_id,
                'high_dim_vector': syn_high_dim.tolist(),
                'low_dim_vector': syn_low_dim.tolist(),
                'round': round_num
            }
            new_nodes_this_round.append(syn_id)
            if verbose:
                print(f" [{anchor_id}] Generated {syn_id} "
                      f"(group size: {len(group.members)})")

            # --- Reassignment: does the edge move to the new synthetic node? ---
            # Build co-occurrence distributions for current holder vs new copy;
            # neighbor edge sets serve as a proxy for co-occurrence groups.
            holder_edges = set(get_neighbors(G, group.edge_holder))
            syn_edges = set(get_neighbors(G, syn_id))
            group_edge_sets = [holder_edges, syn_edges]
            counts = cooccurrence_affinity(group.high_dim_edge, group_edge_sets)
            h = shannon_entropy(counts)
            if verbose:
                print(f" Edge '{group.high_dim_edge}' co-occurrence counts: "
                      f"{counts}, H={h:.4f} bits")

            winner = best_group(counts)
            candidate_holder = (group.edge_holder if winner == 0 else syn_id)
            if candidate_holder != group.edge_holder:
                group.reassign_edge(candidate_holder)
                if verbose:
                    print(f" → Edge reassigned to {candidate_holder}")

            # --- Duplication check (entropy threshold) ---
            if h >= entropy_threshold:
                if verbose:
                    print(f" → H={h:.4f} >= θ={entropy_threshold}: "
                          f"edge '{group.high_dim_edge}' duplicated to "
                          f"both {group.edge_holder} and {syn_id}")
                synthetic_nodes[syn_id]['duplicate_edge'] = True

        # --- Termination check ---
        if not new_nodes_this_round:
            if verbose:
                print(f"\n[DONE] No new synthetic nodes in round {round_num}. "
                      f"Algorithm converged.")
            break
        else:
            if verbose:
                print(f" → {len(new_nodes_this_round)} new synthetic node(s) added.")
    else:
        # for-else: only reached when the loop exhausts without `break`.
        print(f"[WARN] Reached max_rounds={max_rounds} without convergence.")

    return G, groups, synthetic_nodes
# —————————————————————————
# Summary report
# —————————————————————————
def print_summary(G, groups, synthetic_nodes):
    """Print a human-readable summary of the expansion results."""
    print("\n" + "=" * 50)
    print("EXPANSION SUMMARY")
    print("=" * 50)
    n_anchors = len(groups)
    n_synthetic = len(synthetic_nodes)
    print(f"Anchor nodes    : {n_anchors}")
    print(f"Synthetic nodes : {n_synthetic}")
    if not GPU:
        # number_of_edges() is a NetworkX API; the cuGraph object differs.
        print(f"Total edges     : {G.number_of_edges()}")
    print()
    for aid, group in groups.items():
        print(f"Group [{aid}]")
        print(f"  Members      : {group.members}")
        print(f"  Edge holder  : {group.edge_holder}")
        print(f"  High-dim edge: {group.high_dim_edge}")
        print()
    duplicated = [n for n, m in synthetic_nodes.items()
                  if m.get('duplicate_edge')]
    if duplicated:
        print(f"Nodes with duplicated edge: {duplicated}")
    else:
        print("No edge duplications triggered.")
# —————————————————————————
# Example / test
# —————————————————————————
if __name__ == "__main__":
    np.random.seed(42)

    # Small synthetic dataset — 4 anchor nodes with 16-dim feature vectors.
    # Edges represent existing graph connections between anchors.
    anchor_data = [
        {
            'id': 0,
            'vector': np.random.randn(16).tolist(),
            'high_dim_edge': 'hd_edge_A',
            'edges': [(1,), (2,)]
        },
        {
            'id': 1,
            'vector': np.random.randn(16).tolist(),
            'high_dim_edge': 'hd_edge_B',
            'edges': [(0,), (3,)]
        },
        {
            'id': 2,
            'vector': np.random.randn(16).tolist(),
            'high_dim_edge': 'hd_edge_C',
            'edges': [(0,)]
        },
        {
            'id': 3,
            'vector': np.random.randn(16).tolist(),
            'high_dim_edge': 'hd_edge_D',
            'edges': [(1,)]
        },
    ]

    G, groups, synthetic_nodes = run_expansion(
        anchor_data,
        similarity_threshold=0.1,  # low threshold so test data produces edges
        entropy_threshold=0.95,
        low_dim=8,
        max_rounds=10,
        verbose=True
    )

    print_summary(G, groups, synthetic_nodes)