# Source: main.py from the GitHub repository PRO100CHOK/threads-net-posts-profiles-scraper-python (branch "main").
"""Threads.net Scraper — brand-mention monitoring demo.

Use case: search Threads for posts mentioning a brand keyword, capture each
post's author, text, likes, replies, and timestamps. Run on a cron to keep a
rolling mention log. Outputs JSON + CSV.

Run:
    pip install -r requirements.txt
    cp .env.example .env
    python main.py

Actor: https://apify.com/pro100chok/threads-scraper-usage?utm_source=github_pro100&utm_medium=readme&utm_campaign=threads-net-posts-profiles-scraper-python
"""
from __future__ import annotations

import csv
import json
import os
import sys
from pathlib import Path

from apify_client import ApifyClient
from apify_client.errors import ApifyApiError
from dotenv import load_dotenv

# Search keywords; main() sends this list to the actor as the "queries" input.
QUERIES = ["claude code", "anthropic"]
# Sent to the actor as the "maxItems" input. The name suggests a per-query
# cap, but how the actor applies the limit is not visible here — TODO confirm.
MAX_POSTS_PER_QUERY = 75

# Identifier handed to client.actor() to select which Apify actor is run.
ACTOR_ID = "pro100chok/threads-scraper-usage"


def _count(post: dict, *keys: str) -> object:
    """Return the first value among *keys* in *post* that is not ``None``.

    An ``or`` chain would treat a genuine ``0`` (a post with no likes, replies
    or reposts) as missing and discard it; this keeps it.
    """
    for key in keys:
        value = post.get(key)
        if value is not None:
            return value
    return None


def _to_row(post: dict) -> dict:
    """Flatten one dataset item into the fixed set of CSV columns."""
    # Each column tries alternative key names and ends up as None (or "" for
    # the text) when none of them is present on the item.
    author = post.get("user") or {}
    return {
        "matched_query": post.get("query"),
        "post_url": post.get("url") or post.get("permalink"),
        "username": author.get("username") or post.get("username"),
        # Only the first 280 characters of the text are kept in the CSV.
        "text": (post.get("text") or post.get("caption") or "")[:280],
        "likes": _count(post, "likeCount", "likes"),
        "replies": _count(post, "replyCount", "replies"),
        "reposts": _count(post, "repostCount", "reposts"),
        "published_at": post.get("publishedAt") or post.get("createdAt"),
    }


def main() -> int:
    """Run the Threads search actor and write the mentions to JSON and CSV.

    Reads ``APIFY_API_TOKEN`` from the environment (after loading ``.env``),
    calls the actor with ``QUERIES``, dumps the raw dataset items to
    ``output.json`` and a flattened view to ``output.csv`` next to this
    script, then prints a per-query count and the most-liked mention for each
    query.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: with an explanatory message when the token is missing,
            the actor call fails, the run does not end in ``SUCCEEDED``, or
            the dataset is empty.
    """
    load_dotenv()
    token = os.environ.get("APIFY_API_TOKEN")
    if not token:
        sys.exit("APIFY_API_TOKEN missing — copy .env.example to .env first.")

    client = ApifyClient(token)
    run_input = {
        "action": "search",
        "queries": QUERIES,
        "serp_type": "default",
        "maxItems": MAX_POSTS_PER_QUERY,
        "proxy": {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"],
            "apifyProxyCountry": "US",
        },
    }

    print(f"Searching Threads for {len(QUERIES)} keywords, up to {MAX_POSTS_PER_QUERY} posts each...")
    try:
        run = client.actor(ACTOR_ID).call(run_input=run_input, timeout_secs=900)
    except ApifyApiError as exc:
        sys.exit(f"Actor call failed: {exc}")

    if not run or run.get("status") != "SUCCEEDED":
        sys.exit(f"Run not succeeded: {run.get('status') if run else 'no run'}")

    items = list(client.dataset(run["defaultDatasetId"]).iterate_items())
    if not items:
        sys.exit("Empty dataset — see Apify run log.")

    # Both output files are written beside this script, not in the CWD.
    out = Path(__file__).parent
    (out / "output.json").write_text(json.dumps(items, indent=2, ensure_ascii=False), encoding="utf-8")

    rows = [_to_row(post) for post in items]

    # `items` is non-empty here, so rows[0] exists and supplies the header.
    with (out / "output.csv").open("w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)

    print(f"\nCaptured {len(rows)} mentions → output.json + output.csv\n")
    by_query: dict[str, int] = {}
    for row in rows:
        # Rows without a query are grouped under "?".
        label = row["matched_query"] or "?"
        by_query[label] = by_query.get(label, 0) + 1
    for label, total in by_query.items():
        print(f"  {label!r}: {total} posts")

    print("\nMost-liked mention per query:")
    seen: set[str] = set()
    # Rows are visited from most to least liked, so the first row seen for a
    # query is that query's top mention; later rows for it are skipped.
    for row in sorted(rows, key=lambda r: r["likes"] or 0, reverse=True):
        # Same "?" placeholder as the tally above, so both sections agree.
        label = row["matched_query"] or "?"
        if label in seen:
            continue
        seen.add(label)
        print(f"  [{label}] @{row['username']} — {row['likes']}❤  {row['text'][:120]}")
    return 0


if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    sys.exit(main())