26 changes: 19 additions & 7 deletions easygraph/datasets/__init__.py
@@ -1,3 +1,4 @@
# risky imports
try:
from easygraph.datasets.get_sample_graph import *
from easygraph.datasets.gnn_benchmark import *
@@ -8,16 +9,27 @@
from easygraph.datasets.karate import KarateClubDataset
from easygraph.datasets.mathoverflow_answers import mathoverflow_answers

from .citation_graph import CitationGraphDataset
from .citation_graph import CiteseerGraphDataset
from .citation_graph import CoraBinary
from .citation_graph import CoraGraphDataset
from .citation_graph import PubmedGraphDataset
from .ppi import LegacyPPIDataset
from .ppi import PPIDataset

except:
except Exception as e:
print(
"Please install PyTorch before using graph-related datasets and"
f" hypergraph-related datasets: {e}"
)

from .amazon_photo import AmazonPhotoDataset
from .arxiv import ArxivHEPTHDataset
from .citation_graph import CitationGraphDataset
from .citation_graph import CiteseerGraphDataset
from .citation_graph import CoraBinary
from .citation_graph import CoraGraphDataset
from .citation_graph import PubmedGraphDataset
from .coauthor import CoauthorCSDataset
from .facebook_ego import FacebookEgoNetDataset
from .flickr import FlickrDataset
from .github import GitHubUsersDataset
from .reddit import RedditDataset
from .roadnet import RoadNetCADataset
from .twitter_ego import TwitterEgoDataset
from .web_google import WebGoogleDataset
from .wiki_topcats import WikiTopCatsDataset
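
With this reorganization, the datasets imported outside the try/except block no longer depend on PyTorch; only the citation and PPI datasets stay behind the guard. A minimal usage sketch, assuming the package is installed and the download succeeds:

# Minimal sketch: the new datasets import even without PyTorch installed,
# since they now sit outside the try/except guard above.
from easygraph.datasets import ArxivHEPTHDataset

dataset = ArxivHEPTHDataset()  # downloads and processes on first use
g = dataset[0]
print(g.number_of_nodes(), g.number_of_edges())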
110 changes: 110 additions & 0 deletions easygraph/datasets/amazon_photo.py
@@ -0,0 +1,110 @@
import os

import easygraph as eg
import numpy as np
import scipy.sparse as sp

from easygraph.classes.graph import Graph

from .graph_dataset_base import EasyGraphBuiltinDataset
from .utils import data_type_dict
from .utils import download
from .utils import extract_archive
from .utils import tensor


class AmazonPhotoDataset(EasyGraphBuiltinDataset):
r"""Amazon Electronics Photo co-purchase graph dataset.

Nodes represent products, and edges link products that are frequently
co-purchased. Node features are bag-of-words vectors of product reviews.
The task is to classify the product category.

Statistics:

- Nodes: 7,650
- Edges: 119,081
- Number of Classes: 8
- Features: 745

Parameters
----------
raw_dir : str, optional
Directory to download to and store the raw input data. Default: None
force_reload : bool, optional
Whether to reload the dataset. Default: False
verbose : bool, optional
Whether to print out progress information. Default: True
transform : callable, optional
A transform that takes in a :class:`~easygraph.Graph` object and returns
a transformed version. The :class:`~easygraph.Graph` object will be
transformed before every access.

Examples
--------
>>> from easygraph.datasets import AmazonPhotoDataset
>>> dataset = AmazonPhotoDataset()
>>> g = dataset[0]
>>> print(g.number_of_nodes())
>>> print(g.number_of_edges())
>>> print(g.nodes[0]['feat'].shape)
>>> print(g.nodes[0]['label'])
>>> print(dataset.num_classes)
"""

def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
name = "amazon_photo"
url = "https://data.dgl.ai/dataset/amazon_co_buy_photo.zip"
super(AmazonPhotoDataset, self).__init__(
name=name,
url=url,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)

def process(self):
path = os.path.join(self.raw_path, "amazon_co_buy_photo.npz")
data = np.load(path)

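# Reconstruct the sparse adjacency matrix from its stored CSR components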
adj = sp.csr_matrix(
(data["adj_data"], data["adj_indices"], data["adj_indptr"]),
shape=data["adj_shape"],
)

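# Reconstruct the node feature matrix and densify it for per-node slicing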
features = sp.csr_matrix(
(data["attr_data"], data["attr_indices"], data["attr_indptr"]),
shape=data["attr_shape"],
).todense()

labels = data["labels"]

g = eg.Graph()
g.add_edges_from(list(zip(*adj.nonzero())))

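# Attach each node's dense feature vector and integer class label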
for i in range(features.shape[0]):
g.add_node(i, feat=np.array(features[i]).squeeze(), label=int(labels[i]))

self._g = g
self._num_classes = len(np.unique(labels))

if self.verbose:
print("Finished loading AmazonPhoto dataset.")
print(f" NumNodes: {g.number_of_nodes()}")
print(f" NumEdges: {g.number_of_edges()}")
print(f" NumFeats: {features.shape[1]}")
print(f" NumClasses: {self._num_classes}")

def __getitem__(self, idx):
assert idx == 0, "AmazonPhotoDataset only contains one graph"
if self._g is None:
raise ValueError("Graph has not been loaded or processed correctly.")
return self._g if self._transform is None else self._transform(self._g)

def __len__(self):
return 1

@property
def num_classes(self):
return self._num_classes
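
Because the `transform` callable runs on every `__getitem__` access, per-access preprocessing can be attached at construction time. A minimal sketch; the normalization function here is illustrative, not part of the PR:

import numpy as np

from easygraph.datasets import AmazonPhotoDataset

def l2_normalize(g):
    # Illustrative transform: L2-normalize each node's feature vector in place.
    for n in g.nodes:
        feat = g.nodes[n]["feat"]
        norm = np.linalg.norm(feat)
        if norm > 0:
            g.nodes[n]["feat"] = feat / norm
    return g

dataset = AmazonPhotoDataset(transform=l2_normalize)
g = dataset[0]  # the transform is applied on each access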
106 changes: 106 additions & 0 deletions easygraph/datasets/arxiv.py
@@ -0,0 +1,106 @@
"""Arxiv HEP-TH Citation Network

This dataset represents the citation network of preprints from the High Energy Physics - Theory (HEP-TH) category on arXiv, covering the period from January 1993 to April 2003.

Each node corresponds to a paper, and a directed edge from paper A to paper B indicates that A cites B.

No features or labels are included in this dataset.

Statistics:
- Nodes: 27,770
- Edges: 352,807
- Features: None
- Labels: None

Reference:
J. Leskovec, J. Kleinberg and C. Faloutsos, "Graphs over Time: Densification Laws, Shrinking Diameters and Possible Explanations,"
in KDD 2005. Dataset: https://snap.stanford.edu/data/cit-HepTh.html
"""

import gzip
import os
import shutil

import easygraph as eg

from easygraph.classes.graph import Graph

from .graph_dataset_base import EasyGraphBuiltinDataset
from .utils import download


class ArxivHEPTHDataset(EasyGraphBuiltinDataset):
r"""Arxiv HEP-TH citation network dataset.

Parameters
----------
raw_dir : str, optional
Directory to store the raw downloaded files. Default: None
force_reload : bool, optional
Whether to re-download and process the dataset. Default: False
verbose : bool, optional
Whether to print detailed processing logs. Default: True
transform : callable, optional
Optional transform to apply on the graph.

Examples
--------
>>> from easygraph.datasets import ArxivHEPTHDataset
>>> dataset = ArxivHEPTHDataset()
>>> g = dataset[0]
>>> print("Nodes:", g.number_of_nodes())
>>> print("Edges:", g.number_of_edges())
"""

def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
name = "cit-HepTh"
url = "https://snap.stanford.edu/data/cit-HepTh.txt.gz"
super(ArxivHEPTHDataset, self).__init__(
name=name,
url=url,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)

def download(self):
r"""Download and decompress the .txt.gz file."""
compressed_path = os.path.join(self.raw_dir, self.name + ".txt.gz")
extracted_path = os.path.join(self.raw_path, self.name + ".txt")

download(self.url, path=compressed_path)

if not os.path.exists(self.raw_path):
os.makedirs(self.raw_path)

with gzip.open(compressed_path, "rb") as f_in:
with open(extracted_path, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)

def process(self):
graph = eg.DiGraph() # Citation network is directed
edge_list_path = os.path.join(self.raw_path, self.name + ".txt")

with open(edge_list_path, "r") as f:
for line in f:
if line.startswith("#") or line.strip() == "":
continue
u, v = map(int, line.strip().split())
graph.add_edge(u, v)

self._g = graph
self._num_nodes = graph.number_of_nodes()
self._num_edges = graph.number_of_edges()

if self.verbose:
print("Finished loading Arxiv HEP-TH dataset.")
print(f" NumNodes: {self._num_nodes}")
print(f" NumEdges: {self._num_edges}")

def __getitem__(self, idx):
assert idx == 0, "ArxivHEPTHDataset only contains one graph"
return self._g if self._transform is None else self._transform(self._g)

def __len__(self):
return 1
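
For readers who want the raw SNAP edge list without the dataset wrapper, the parsing step in `process()` is easy to replicate. A minimal sketch; the file path is illustrative and assumes the archive has already been extracted:

import easygraph as eg

g = eg.DiGraph()  # edges run from citing paper to cited paper
with open("cit-HepTh.txt") as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # skip SNAP header comments and blank lines
        u, v = map(int, line.split())
        g.add_edge(u, v)

print("Nodes:", g.number_of_nodes(), "Edges:", g.number_of_edges())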
6 changes: 3 additions & 3 deletions easygraph/datasets/citation_graph.py
@@ -1,6 +1,5 @@
"""Cora, citeseer, pubmed dataset.
"""Cora, citeseer, pubmed dataset."""

"""
from __future__ import absolute_import

import os
@@ -53,9 +52,10 @@ class CitationGraphDataset(EasyGraphBuiltinDataset):
reorder : bool
Whether to reorder the graph using :func:`~eg.reorder_graph`. Default: False.
"""

_urls = {
"cora_v2": "dataset/cora_v2.zip",
"citeseer": "dataset/citeSeer.zip",
"citeseer": "dataset/citeseer.zip",
"pubmed": "dataset/pubmed.zip",
}

118 changes: 118 additions & 0 deletions easygraph/datasets/coauthor.py
@@ -0,0 +1,118 @@
"""CoauthorCS Dataset

This dataset contains a co-authorship network of authors who submitted papers to the CS category.
Each node represents an author, and edges represent co-authorships.
Node features are bag-of-words representations of keywords in the author's papers.
The task is node classification, with labels indicating the author's primary field of study.

Statistics:
- Nodes: 18,333
- Edges: 81,894
- Feature Dim: 6,805
- Classes: 15

Source: https://github.com/dmlc/dgl/tree/master/examples/pytorch/cluster_gcn
"""

import os

import easygraph as eg
import numpy as np
import scipy.sparse as sp

from easygraph.classes.graph import Graph

from .graph_dataset_base import EasyGraphBuiltinDataset
from .utils import data_type_dict
from .utils import download
from .utils import extract_archive
from .utils import tensor


class CoauthorCSDataset(EasyGraphBuiltinDataset):
r"""CoauthorCS citation network dataset.

Nodes are authors, and edges indicate co-authorship relationships. Each node
has a bag-of-words feature vector and a label denoting the primary research field.

Parameters
----------
raw_dir : str, optional
Directory to store the raw downloaded files. Default: None
force_reload : bool, optional
Whether to re-download and process the dataset. Default: False
verbose : bool, optional
Whether to print detailed processing logs. Default: True
transform : callable, optional
Transform to apply to the graph on access.

Examples
--------
>>> from easygraph.datasets import CoauthorCSDataset
>>> dataset = CoauthorCSDataset()
>>> g = dataset[0]
>>> print("Nodes:", g.number_of_nodes())
>>> print("Edges:", g.number_of_edges())
>>> print("Feature shape:", g.nodes[0]['feat'].shape)
>>> print("Label:", g.nodes[0]['label'])
>>> print("Number of classes:", dataset.num_classes)
"""

def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
name = "coauthor_cs"
url = "https://data.dgl.ai/dataset/coauthor_cs.zip"
super(CoauthorCSDataset, self).__init__(
name=name,
url=url,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose,
transform=transform,
)

def process(self):
path = os.path.join(self.raw_path, "coauthor_cs.npz")
data = np.load(path)

# Reconstruct adjacency matrix
adj = sp.csr_matrix(
(data["adj_data"], data["adj_indices"], data["adj_indptr"]),
shape=data["adj_shape"],
)

# Reconstruct feature matrix
features = sp.csr_matrix(
(data["attr_data"], data["attr_indices"], data["attr_indptr"]),
shape=data["attr_shape"],
).todense()

labels = data["labels"]

g = eg.Graph()
g.add_edges_from(list(zip(*adj.nonzero())))

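# Attach each node's dense feature vector and integer class label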
for i in range(features.shape[0]):
g.add_node(i, feat=np.array(features[i]).squeeze(), label=int(labels[i]))

self._g = g
self._num_classes = len(np.unique(labels))

if self.verbose:
print("Finished loading CoauthorCS dataset.")
print(f" NumNodes: {g.number_of_nodes()}")
print(f" NumEdges: {g.number_of_edges()}")
print(f" NumFeats: {features.shape[1]}")
print(f" NumClasses: {self._num_classes}")

def __getitem__(self, idx):
assert idx == 0, "CoauthorCSDataset only contains one graph"
if self._g is None:
raise ValueError("Graph has not been loaded or processed correctly.")
return self._g if self._transform is None else self._transform(self._g)

def __len__(self):
return 1

@property
def num_classes(self):
return self._num_classes
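
Both npz-backed datasets store features and labels as node attributes, so assembling dense arrays for a downstream classifier is straightforward. A minimal sketch; `sorted(g.nodes)` assumes integer node ids, which matches how `process()` builds the graph, and the shapes follow the statistics above:

import numpy as np

from easygraph.datasets import CoauthorCSDataset

dataset = CoauthorCSDataset()
g = dataset[0]

# Node ids are the integer row indices assigned in process().
nodes = sorted(g.nodes)
X = np.stack([g.nodes[n]["feat"] for n in nodes])   # shape (18333, 6805)
y = np.array([g.nodes[n]["label"] for n in nodes])  # shape (18333,)
assert len(np.unique(y)) == dataset.num_classes     # 15 classes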