26 changes: 19 additions & 7 deletions easygraph/datasets/__init__.py
@@ -1,3 +1,4 @@
# risky imports
try:
from easygraph.datasets.get_sample_graph import *
from easygraph.datasets.gnn_benchmark import *
@@ -8,16 +9,27 @@
from easygraph.datasets.karate import KarateClubDataset
from easygraph.datasets.mathoverflow_answers import mathoverflow_answers

from .citation_graph import CitationGraphDataset
from .citation_graph import CiteseerGraphDataset
from .citation_graph import CoraBinary
from .citation_graph import CoraGraphDataset
from .citation_graph import PubmedGraphDataset
from .ppi import LegacyPPIDataset
from .ppi import PPIDataset

except:
except Exception as e:
print(
"Please install PyTorch before using graph-related datasets and"
f" hypergraph-related datasets: {e}"
)

from .amazon_photo import AmazonPhotoDataset
from .arxiv import ArxivHEPTHDataset
from .citation_graph import CitationGraphDataset
from .citation_graph import CiteseerGraphDataset
from .citation_graph import CoraBinary
from .citation_graph import CoraGraphDataset
from .citation_graph import PubmedGraphDataset
from .coauthor import CoauthorCSDataset
from .facebook_ego import FacebookEgoNetDataset
from .flickr import FlickrDataset
from .github import GitHubUsersDataset
from .reddit import RedditDataset
from .roadnet import RoadNetCADataset
from .twitter_ego import TwitterEgoDataset
from .web_google import WebGoogleDataset
from .wiki_topcats import WikiTopCatsDataset
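
With this reorganization, the datasets imported outside the try/except block no longer depend on PyTorch; only the citation and PPI datasets stay behind the guard. A minimal usage sketch, assuming the package is installed and the download succeeds:

# Minimal sketch: the new datasets import even without PyTorch installed,
# since they now sit outside the try/except guard above.
from easygraph.datasets import ArxivHEPTHDataset

dataset = ArxivHEPTHDataset()  # downloads and processes on first use
g = dataset[0]
print(g.number_of_nodes(), g.number_of_edges())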
110 changes: 110 additions & 0 deletions easygraph/datasets/amazon_photo.py
@@ -0,0 +1,110 @@
import os

import easygraph as eg
import numpy as np
import scipy.sparse as sp

from easygraph.classes.graph import Graph

from .graph_dataset_base import EasyGraphBuiltinDataset
from .utils import data_type_dict
from .utils import download
from .utils import extract_archive
from .utils import tensor


class AmazonPhotoDataset(EasyGraphBuiltinDataset):
r"""Amazon Electronics Photo co-purchase graph dataset.

Nodes represent products, and edges link products that are frequently
co-purchased. Node features are bag-of-words vectors of product reviews.
The task is to classify the product category.

Statistics:

- Nodes: 7,650
- Edges: 119,081
- Number of Classes: 8
- Features: 745

Parameters
----------
raw_dir : str, optional
Directory to download to and store the raw input data. Default: None
force_reload : bool, optional
Whether to reload the dataset. Default: False
verbose : bool, optional
Whether to print out progress information. Default: True
transform : callable, optional
A transform that takes in a :class:`~easygraph.Graph` object and returns
a transformed version. The :class:`~easygraph.Graph` object will be
transformed before every access.

Examples
--------
>>> from easygraph.datasets import AmazonPhotoDataset
>>> dataset = AmazonPhotoDataset()
>>> g = dataset[0]
>>> print(g.number_of_nodes())
>>> print(g.number_of_edges())
>>> print(g.nodes[0]['feat'].shape)
>>> print(g.nodes[0]['label'])
>>> print(dataset.num_classes)
"""

def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
name = "amazon_photo"
url = "https://data.dgl.ai/dataset/amazon_co_buy_photo.zip"
super(AmazonPhotoDataset, self).__init__(
name=name,
url=url,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)

def process(self):
path = os.path.join(self.raw_path, "amazon_co_buy_photo.npz")
data = np.load(path)

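# Reconstruct the sparse adjacency matrix from its stored CSR components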
adj = sp.csr_matrix(
(data["adj_data"], data["adj_indices"], data["adj_indptr"]),
shape=data["adj_shape"],
)

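# Reconstruct the node feature matrix and densify it for per-node slicing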
features = sp.csr_matrix(
(data["attr_data"], data["attr_indices"], data["attr_indptr"]),
shape=data["attr_shape"],
).todense()

labels = data["labels"]

g = eg.Graph()
g.add_edges_from(list(zip(*adj.nonzero())))

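# Attach each node's dense feature vector and integer class label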
for i in range(features.shape[0]):
g.add_node(i, feat=np.array(features[i]).squeeze(), label=int(labels[i]))

self._g = g
self._num_classes = len(np.unique(labels))

if self.verbose:
print("Finished loading AmazonPhoto dataset.")
print(f" NumNodes: {g.number_of_nodes()}")
print(f" NumEdges: {g.number_of_edges()}")
print(f" NumFeats: {features.shape[1]}")
print(f" NumClasses: {self._num_classes}")

def __getitem__(self, idx):
assert idx == 0, "AmazonPhotoDataset only contains one graph"
if self._g is None:
raise ValueError("Graph has not been loaded or processed correctly.")
return self._g if self._transform is None else self._transform(self._g)

def __len__(self):
return 1

@property
def num_classes(self):
return self._num_classes
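
Because the `transform` callable runs on every `__getitem__` access, per-access preprocessing can be attached at construction time. A minimal sketch; the normalization function here is illustrative, not part of the PR:

import numpy as np

from easygraph.datasets import AmazonPhotoDataset

def l2_normalize(g):
    # Illustrative transform: L2-normalize each node's feature vector in place.
    for n in g.nodes:
        feat = g.nodes[n]["feat"]
        norm = np.linalg.norm(feat)
        if norm > 0:
            g.nodes[n]["feat"] = feat / norm
    return g

dataset = AmazonPhotoDataset(transform=l2_normalize)
g = dataset[0]  # the transform is applied on each access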
106 changes: 106 additions & 0 deletions easygraph/datasets/arxiv.py
@@ -0,0 +1,106 @@
"""Arxiv HEP-TH Citation Network

This dataset represents the citation network of preprints from the High Energy Physics - Theory (HEP-TH) category on arXiv, covering the period from January 1993 to April 2003.

Each node corresponds to a paper, and a directed edge from paper A to paper B indicates that A cites B.

No features or labels are included in this dataset.

Statistics:
- Nodes: 27,770
- Edges: 352,807
- Features: None
- Labels: None

Reference:
J. Leskovec, J. Kleinberg and C. Faloutsos, "Graphs over Time: Densification Laws, Shrinking Diameters and Possible Explanations,"
in KDD 2005. Dataset: https://snap.stanford.edu/data/cit-HepTh.html
"""

import gzip
import os
import shutil

import easygraph as eg

from easygraph.classes.graph import Graph

from .graph_dataset_base import EasyGraphBuiltinDataset
from .utils import download


class ArxivHEPTHDataset(EasyGraphBuiltinDataset):
r"""Arxiv HEP-TH citation network dataset.

Parameters
----------
raw_dir : str, optional
Directory to store the raw downloaded files. Default: None
force_reload : bool, optional
Whether to re-download and process the dataset. Default: False
verbose : bool, optional
Whether to print detailed processing logs. Default: True
transform : callable, optional
Optional transform to apply on the graph.

Examples
--------
>>> from easygraph.datasets import ArxivHEPTHDataset
>>> dataset = ArxivHEPTHDataset()
>>> g = dataset[0]
>>> print("Nodes:", g.number_of_nodes())
>>> print("Edges:", g.number_of_edges())
"""

def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
name = "cit-HepTh"
url = "https://snap.stanford.edu/data/cit-HepTh.txt.gz"
super(ArxivHEPTHDataset, self).__init__(
name=name,
url=url,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)

def download(self):
r"""Download and decompress the .txt.gz file."""
compressed_path = os.path.join(self.raw_dir, self.name + ".txt.gz")
extracted_path = os.path.join(self.raw_path, self.name + ".txt")

download(self.url, path=compressed_path)

if not os.path.exists(self.raw_path):
os.makedirs(self.raw_path)

with gzip.open(compressed_path, "rb") as f_in:
with open(extracted_path, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)

def process(self):
graph = eg.DiGraph() # Citation network is directed
edge_list_path = os.path.join(self.raw_path, self.name + ".txt")

with open(edge_list_path, "r") as f:
for line in f:
if line.startswith("#") or line.strip() == "":
continue
u, v = map(int, line.strip().split())
graph.add_edge(u, v)

self._g = graph
self._num_nodes = graph.number_of_nodes()
self._num_edges = graph.number_of_edges()

if self.verbose:
print("Finished loading Arxiv HEP-TH dataset.")
print(f" NumNodes: {self._num_nodes}")
print(f" NumEdges: {self._num_edges}")

def __getitem__(self, idx):
assert idx == 0, "ArxivHEPTHDataset only contains one graph"
return self._g if self._transform is None else self._transform(self._g)

def __len__(self):
return 1
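
For readers who want the raw SNAP edge list without the dataset wrapper, the parsing step in `process()` is easy to replicate. A minimal sketch; the file path is illustrative and assumes the archive has already been extracted:

import easygraph as eg

g = eg.DiGraph()  # edges run from citing paper to cited paper
with open("cit-HepTh.txt") as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # skip SNAP header comments and blank lines
        u, v = map(int, line.split())
        g.add_edge(u, v)

print("Nodes:", g.number_of_nodes(), "Edges:", g.number_of_edges())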
6 changes: 3 additions & 3 deletions easygraph/datasets/citation_graph.py
@@ -1,6 +1,5 @@
"""Cora, citeseer, pubmed dataset.
"""Cora, citeseer, pubmed dataset."""

"""
from __future__ import absolute_import

import os
@@ -53,9 +52,10 @@ class CitationGraphDataset(EasyGraphBuiltinDataset):
reorder : bool
Whether to reorder the graph using :func:`~eg.reorder_graph`. Default: False.
"""

_urls = {
"cora_v2": "dataset/cora_v2.zip",
"citeseer": "dataset/citeSeer.zip",
"citeseer": "dataset/citeseer.zip",
"pubmed": "dataset/pubmed.zip",
}

118 changes: 118 additions & 0 deletions easygraph/datasets/coauthor.py
@@ -0,0 +1,118 @@
"""CoauthorCS Dataset

This dataset contains a co-authorship network of authors who submitted papers to the CS category.
Each node represents an author, and edges represent co-authorships.
Node features are bag-of-words representations of keywords in the author's papers.
The task is node classification, with labels indicating the author's primary field of study.

Statistics:
- Nodes: 18,333
- Edges: 81,894
- Feature Dim: 6,805
- Classes: 15

Source: https://github.com/dmlc/dgl/tree/master/examples/pytorch/cluster_gcn
"""

import os

import easygraph as eg
import numpy as np
import scipy.sparse as sp

from easygraph.classes.graph import Graph

from .graph_dataset_base import EasyGraphBuiltinDataset
from .utils import data_type_dict
from .utils import download
from .utils import extract_archive
from .utils import tensor


class CoauthorCSDataset(EasyGraphBuiltinDataset):
r"""CoauthorCS citation network dataset.

Nodes are authors, and edges indicate co-authorship relationships. Each node
has a bag-of-words feature vector and a label denoting the primary research field.

Parameters
----------
raw_dir : str, optional
Directory to store the raw downloaded files. Default: None
force_reload : bool, optional
Whether to re-download and process the dataset. Default: False
verbose : bool, optional
Whether to print detailed processing logs. Default: True
transform : callable, optional
Transform to apply to the graph on access.

Examples
--------
>>> from easygraph.datasets import CoauthorCSDataset
>>> dataset = CoauthorCSDataset()
>>> g = dataset[0]
>>> print("Nodes:", g.number_of_nodes())
>>> print("Edges:", g.number_of_edges())
>>> print("Feature shape:", g.nodes[0]['feat'].shape)
>>> print("Label:", g.nodes[0]['label'])
>>> print("Number of classes:", dataset.num_classes)
"""

def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
name = "coauthor_cs"
url = "https://data.dgl.ai/dataset/coauthor_cs.zip"
super(CoauthorCSDataset, self).__init__(
name=name,
url=url,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)

def process(self):
path = os.path.join(self.raw_path, "coauthor_cs.npz")
data = np.load(path)

# Reconstruct adjacency matrix
adj = sp.csr_matrix(
(data["adj_data"], data["adj_indices"], data["adj_indptr"]),
shape=data["adj_shape"],
)

# Reconstruct feature matrix
features = sp.csr_matrix(
(data["attr_data"], data["attr_indices"], data["attr_indptr"]),
shape=data["attr_shape"],
).todense()

labels = data["labels"]

g = eg.Graph()
g.add_edges_from(list(zip(*adj.nonzero())))

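# Attach each node's dense feature vector and integer class label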
for i in range(features.shape[0]):
g.add_node(i, feat=np.array(features[i]).squeeze(), label=int(labels[i]))

self._g = g
self._num_classes = len(np.unique(labels))

if self.verbose:
print("Finished loading CoauthorCS dataset.")
print(f" NumNodes: {g.number_of_nodes()}")
print(f" NumEdges: {g.number_of_edges()}")
print(f" NumFeats: {features.shape[1]}")
print(f" NumClasses: {self._num_classes}")

def __getitem__(self, idx):
assert idx == 0, "CoauthorCSDataset only contains one graph"
if self._g is None:
raise ValueError("Graph has not been loaded or processed correctly.")
return self._g if self._transform is None else self._transform(self._g)

def __len__(self):
return 1

@property
def num_classes(self):
return self._num_classes
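
Both npz-backed datasets store features and labels as node attributes, so assembling dense arrays for a downstream classifier is straightforward. A minimal sketch; `sorted(g.nodes)` assumes integer node ids, which matches how `process()` builds the graph, and the shapes follow the statistics above:

import numpy as np

from easygraph.datasets import CoauthorCSDataset

dataset = CoauthorCSDataset()
g = dataset[0]

# Node ids are the integer row indices assigned in process().
nodes = sorted(g.nodes)
X = np.stack([g.nodes[n]["feat"] for n in nodes])   # shape (18333, 6805)
y = np.array([g.nodes[n]["label"] for n in nodes])  # shape (18333,)
assert len(np.unique(y)) == dataset.num_classes     # 15 classes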