Skip to content

Commit b702d1b

Browse files
committed
Ensure bots propagate through concat
1 parent 7a13fb7 commit b702d1b

File tree

3 files changed

+70
-9
lines changed

3 files changed

+70
-9
lines changed

github_activity/github_activity.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ def get_activity(
134134
-------
135135
query_data : pandas DataFrame
136136
A munged collection of data returned from your query. This
137-
will be a combination of issues and PRs.
137+
will be a combination of issues and PRs. The DataFrame has a
138+
`bot_users` entry in its `attrs` dict (`query_data.attrs["bot_users"]`) containing the set of detected bot usernames.
138139
"""
139140

140141
org, repo = _parse_target(target)
@@ -206,13 +207,16 @@ def get_activity(
206207
# Query for both opened and closed issues/PRs in this window
207208
print(f"Running search query:\n{search_query}\n\n", file=sys.stderr)
208209
query_data = []
210+
all_bot_users = set()
209211
for activity_type in ["created", "closed"]:
210212
ii_search_query = (
211213
search_query + f" {activity_type}:{since_dt_str}..{until_dt_str}"
212214
)
213215
qu = GitHubGraphQlQuery(ii_search_query, auth=auth)
214216
qu.request()
215217
query_data.append(qu.data)
218+
# Collect bot users from each query
219+
all_bot_users.update(qu.bot_users)
216220

217221
query_data = (
218222
pd.concat(query_data).drop_duplicates(subset=["id"]).reset_index(drop=True)
@@ -223,9 +227,12 @@ def get_activity(
223227
query_data.until_dt_str = until_dt_str
224228
query_data.since_is_git_ref = since_is_git_ref
225229
query_data.until_is_git_ref = until_is_git_ref
230+
# Restore bot_users in attrs (lost during concat)
231+
query_data.attrs["bot_users"] = all_bot_users
226232

227233
if cache:
228234
_cache_data(query_data, cache)
235+
229236
return query_data
230237

231238

@@ -462,7 +469,7 @@ def generate_activity_md(
462469
data["contributors"] = [[]] * len(data)
463470

464471
# Get bot users from GraphQL data (stored in DataFrame attrs)
465-
bot_users = data.attrs.get("bot_users", set())
472+
bot_users = data.attrs["bot_users"]
466473

467474
def ignored_user(username):
468475
if username in bot_users:
@@ -490,12 +497,19 @@ def filter_ignored(userlist):
490497
# - merger
491498
# - reviewers
492499

493-
item_contributors.author = row.author
500+
# Only add author if they're not a bot
501+
if not ignored_user(row.author):
502+
item_contributors.author = row.author
494503

495504
if row.kind == "pr":
496505
for committer in filter_ignored(row.committers):
497506
item_contributors.add(committer)
498-
if row.mergedBy and row.mergedBy != row.author:
507+
# Only add merger if they're not a bot and not the author
508+
if (
509+
row.mergedBy
510+
and row.mergedBy != row.author
511+
and not ignored_user(row.mergedBy)
512+
):
499513
item_contributors.add(row.mergedBy)
500514
for reviewer in filter_ignored(row.reviewers):
501515
item_contributors.add(reviewer)

github_activity/graphql.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,15 @@
4747
committer {
4848
user {
4949
login
50+
__typename
5051
}
5152
}
5253
authors(first: 10) {
5354
edges {
5455
node {
5556
user {
5657
login
58+
__typename
5759
}
5860
}
5961
}
@@ -140,6 +142,7 @@ def __init__(self, query, display_progress=True, auth=None):
140142
variable `GITHUB_ACCESS_TOKEN` will be tried.
141143
"""
142144
self.query = query
145+
self.bot_users = set() # Store detected bot usernames
143146

144147
# Authentication
145148
token = auth or os.environ.get("GITHUB_ACCESS_TOKEN")
@@ -149,7 +152,7 @@ def __init__(self, query, display_progress=True, auth=None):
149152
"--auth flag or must be used to pass a Personal Access Token "
150153
"needed by the GitHub API. You can generate a token at "
151154
"https://github.com/settings/tokens/new. Note that while "
152-
"working with a public repository, you dont need to set any "
155+
"working with a public repository, you don't need to set any "
153156
"scopes on the token you create."
154157
)
155158
self.auth = TokenAuth(token)
@@ -240,9 +243,7 @@ def request(self, n_pages=100, n_per_page=50):
240243
# Extract bot users from raw data before DataFrame conversion
241244
def is_bot(user_dict):
242245
"""Check if a GraphQL user object represents a bot account."""
243-
if not user_dict:
244-
return False
245-
return user_dict.get("__typename") == "Bot"
246+
return user_dict and user_dict.get("__typename") == "Bot"
246247

247248
bot_users = set()
248249
for item in self.issues_and_or_prs:
@@ -272,10 +273,29 @@ def is_bot(user_dict):
272273
if is_bot(comment_author):
273274
bot_users.add(comment_author["login"])
274275

276+
# Check commit authors and committers
277+
commits = item.get("commits")
278+
if commits:
279+
for commit_edge in commits.get("edges", []):
280+
commit = commit_edge["node"]["commit"]
281+
# Check committer
282+
committer = commit.get("committer")
283+
if committer and committer.get("user"):
284+
if is_bot(committer["user"]):
285+
bot_users.add(committer["user"]["login"])
286+
# Check authors
287+
authors = commit.get("authors")
288+
if authors:
289+
for author_edge in authors.get("edges", []):
290+
author_user = author_edge["node"].get("user")
291+
if author_user and is_bot(author_user):
292+
bot_users.add(author_user["login"])
293+
275294
# Create a dataframe of the issues and/or PRs
276295
self.data = pd.DataFrame(self.issues_and_or_prs)
277-
# Store bot users in DataFrame metadata (attrs dict)
296+
# Store bot users in DataFrame attrs and as instance attribute
278297
self.data.attrs["bot_users"] = bot_users
298+
self.bot_users = bot_users
279299

280300
# Add some extra fields
281301
def get_login(user):

tests/test_cli.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,3 +141,30 @@ def test_contributor_sorting(tmpdir, file_regression):
141141
run(cmd.split(), check=True)
142142
md = path_output.read_text()
143143
file_regression.check(md, extension=".md")
144+
145+
146+
def test_bot_filtering(tmpdir):
147+
"""Test that bot users are detected and filtered from output."""
148+
from github_activity.github_activity import get_activity, generate_activity_md
149+
150+
# Use jupyter-book/mystmd because it's a small release and we know there's bot activity
151+
data = get_activity(
152+
target="jupyter-book/mystmd",
153+
since="mystmd@1.6.5",
154+
until="mystmd@1.6.6",
155+
)
156+
157+
# Verify bot_users attrs exists and was preserved (catches the concat bug)
158+
assert "bot_users" in data.attrs, "bot_users should be in DataFrame attrs"
159+
160+
# Generate markdown and verify no bots appear
161+
md = generate_activity_md(
162+
target="jupyter-book/mystmd",
163+
since="mystmd@1.6.5",
164+
until="mystmd@1.6.6",
165+
)
166+
167+
# Ensure changeset-bot is not anywhere in the output
168+
assert "changeset-bot" not in md, (
169+
"changeset-bot should not appear anywhere in output"
170+
)

0 commit comments

Comments
 (0)