-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
105 lines (85 loc) · 3.43 KB
/
main.py
File metadata and controls
105 lines (85 loc) · 3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""Threads.net Scraper — brand-mention monitoring demo.

Use case: search Threads for posts mentioning a brand keyword and capture each
post's author, text, likes, replies, reposts, and publish timestamp. Run it on
a cron to keep a rolling mention log. Outputs JSON + CSV.

Run:
    pip install -r requirements.txt
    cp .env.example .env    # then set APIFY_API_TOKEN in .env
    python main.py

Actor: https://apify.com/pro100chok/threads-scraper-usage?utm_source=github_pro100&utm_medium=readme&utm_campaign=threads-net-posts-profiles-scraper-python
"""
from __future__ import annotations

import csv
import json
import os
import sys
from collections import Counter
from pathlib import Path
from typing import Any

from apify_client import ApifyClient
from apify_client.errors import ApifyApiError
from dotenv import load_dotenv
# Keywords to search for; sent to the actor as the "queries" input field.
QUERIES: list[str] = ["claude code", "anthropic"]
# Post limit; sent to the actor as the "maxItems" input field.
# NOTE(review): the name implies a per-query cap — confirm the actor applies
# "maxItems" per query rather than to the run as a whole.
MAX_POSTS_PER_QUERY: int = 75
# Identifier of the Apify actor that main() invokes via client.actor(ACTOR_ID).
ACTOR_ID: str = "pro100chok/threads-scraper-usage"
def _first_present(record: dict[str, Any], *keys: str) -> Any:
    """Return the first value among *keys* that is present and not ``None``.

    Unlike ``record.get(a) or record.get(b)``, a falsy-but-real value such as
    ``0`` is kept instead of being treated as missing.
    """
    for key in keys:
        value = record.get(key)
        if value is not None:
            return value
    return None


def _to_row(post: dict[str, Any]) -> dict[str, Any]:
    """Flatten one dataset item into the columns written to ``output.csv``.

    Each column tries a primary key and then an alternate key on the item.
    NOTE(review): the key names are taken as-is from this script; confirm them
    against the actor's actual output schema.
    """
    user = post.get("user") or {}
    return {
        "matched_query": post.get("query"),
        "post_url": post.get("url") or post.get("permalink"),
        "username": user.get("username") or post.get("username"),
        # Text is capped at 280 characters for the CSV; output.json keeps the full item.
        "text": (post.get("text") or post.get("caption") or "")[:280],
        # Counts use an explicit None check so that a genuine 0 is not discarded.
        "likes": _first_present(post, "likeCount", "likes"),
        "replies": _first_present(post, "replyCount", "replies"),
        "reposts": _first_present(post, "repostCount", "reposts"),
        "published_at": post.get("publishedAt") or post.get("createdAt"),
    }


def _write_json(items: list[dict[str, Any]], path: Path) -> None:
    """Dump the raw dataset items to *path* as indented UTF-8 JSON."""
    path.write_text(json.dumps(items, indent=2, ensure_ascii=False), encoding="utf-8")


def _write_csv(rows: list[dict[str, Any]], path: Path) -> None:
    """Write *rows* to *path* as CSV; *rows* must be non-empty (header comes from the first row)."""
    # newline="" lets the csv module control line endings itself.
    with path.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=list(rows[0]))
        writer.writeheader()
        writer.writerows(rows)


def _query_label(row: dict[str, Any]) -> str:
    """Return the row's matched query, or ``"?"`` when the item carried none."""
    return row["matched_query"] or "?"


def _print_summary(rows: list[dict[str, Any]]) -> None:
    """Print the number of posts per query and the most-liked post of each query."""
    per_query = Counter(_query_label(row) for row in rows)
    for label, count in per_query.items():
        print(f" {label!r}: {count} posts")

    print("\nMost-liked mention per query:")
    reported: set[str] = set()
    # Walk rows from most to least liked; the first row seen for a query is its top post.
    # A missing like count sorts as 0.
    for row in sorted(rows, key=lambda r: r["likes"] or 0, reverse=True):
        label = _query_label(row)
        if label in reported:
            continue
        reported.add(label)
        print(f" [{label}] @{row['username']} — {row['likes']}❤ {row['text'][:120]}")


def main() -> int:
    """Run the Threads search actor and export the mentions it returns.

    Reads ``APIFY_API_TOKEN`` from the environment (after ``load_dotenv()``),
    calls the actor named by ``ACTOR_ID`` with the ``QUERIES`` keywords, then
    writes ``output.json`` (raw items) and ``output.csv`` (flattened rows) next
    to this script and prints a short per-query summary.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: with an explanatory message when the token is missing, the
            actor call raises ``ApifyApiError``, the run does not end with
            status ``SUCCEEDED``, or the dataset is empty.
    """
    load_dotenv()
    token = os.environ.get("APIFY_API_TOKEN")
    if not token:
        sys.exit("APIFY_API_TOKEN missing — copy .env.example to .env first.")

    client = ApifyClient(token)
    # NOTE(review): field names below are assumed to follow the actor's input
    # schema — confirm against the actor's documentation.
    run_input = {
        "action": "search",
        "queries": QUERIES,
        "serp_type": "default",
        "maxItems": MAX_POSTS_PER_QUERY,
        "proxy": {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"],
            "apifyProxyCountry": "US",
        },
    }
    print(f"Searching Threads for {len(QUERIES)} keywords, up to {MAX_POSTS_PER_QUERY} posts each...")

    try:
        run = client.actor(ACTOR_ID).call(run_input=run_input, timeout_secs=900)
    except ApifyApiError as exc:
        sys.exit(f"Actor call failed: {exc}")
    if not run or run.get("status") != "SUCCEEDED":
        sys.exit(f"Run not succeeded: {run.get('status') if run else 'no run'}")

    items = list(client.dataset(run["defaultDatasetId"]).iterate_items())
    if not items:
        sys.exit("Empty dataset — see Apify run log.")

    out_dir = Path(__file__).parent
    # Raw JSON is written first so it exists even if flattening a row fails.
    _write_json(items, out_dir / "output.json")
    rows = [_to_row(item) for item in items]
    _write_csv(rows, out_dir / "output.csv")
    print(f"\nCaptured {len(rows)} mentions → output.json + output.csv\n")

    _print_summary(rows)
    return 0
# Script entry point: hand main()'s return value to the interpreter as the exit status.
if __name__ == "__main__":
    sys.exit(main())