Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions github_activity/github_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ def get_activity(
-------
query_data : pandas DataFrame
A munged collection of data returned from your query. This
will be a combination of issues and PRs.
will be a combination of issues and PRs. The DataFrame has a
`bot_users` attribute containing the set of detected bot usernames.
"""

org, repo = _parse_target(target)
Expand Down Expand Up @@ -206,13 +207,16 @@ def get_activity(
# Query for both opened and closed issues/PRs in this window
print(f"Running search query:\n{search_query}\n\n", file=sys.stderr)
query_data = []
all_bot_users = set()
for activity_type in ["created", "closed"]:
ii_search_query = (
search_query + f" {activity_type}:{since_dt_str}..{until_dt_str}"
)
qu = GitHubGraphQlQuery(ii_search_query, auth=auth)
qu.request()
query_data.append(qu.data)
# Collect bot users from each query
all_bot_users.update(qu.bot_users)

query_data = (
pd.concat(query_data).drop_duplicates(subset=["id"]).reset_index(drop=True)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this was clobbering bot_users, so we now explicitly collect it into a set after each query.

Expand All @@ -223,9 +227,12 @@ def get_activity(
query_data.until_dt_str = until_dt_str
query_data.since_is_git_ref = since_is_git_ref
query_data.until_is_git_ref = until_is_git_ref
# Restore bot_users in attrs (lost during concat)
query_data.attrs["bot_users"] = all_bot_users

if cache:
_cache_data(query_data, cache)

return query_data


Expand Down Expand Up @@ -462,7 +469,7 @@ def generate_activity_md(
data["contributors"] = [[]] * len(data)

# Get bot users from GraphQL data (stored in DataFrame attrs)
bot_users = data.attrs.get("bot_users", set())
bot_users = data.attrs["bot_users"]

def ignored_user(username):
if username in bot_users:
Expand Down Expand Up @@ -490,12 +497,19 @@ def filter_ignored(userlist):
# - merger
# - reviewers

item_contributors.author = row.author
# Only add author if they're not a bot
if not ignored_user(row.author):
item_contributors.author = row.author

if row.kind == "pr":
for committer in filter_ignored(row.committers):
item_contributors.add(committer)
if row.mergedBy and row.mergedBy != row.author:
# Only add merger if they're not a bot and not the author
if (
row.mergedBy
and row.mergedBy != row.author
and not ignored_user(row.mergedBy)
):
item_contributors.add(row.mergedBy)
for reviewer in filter_ignored(row.reviewers):
item_contributors.add(reviewer)
Expand Down
30 changes: 25 additions & 5 deletions github_activity/graphql.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,15 @@
committer {
user {
login
__typename
}
}
authors(first: 10) {
edges {
node {
user {
login
__typename
}
}
}
Expand Down Expand Up @@ -140,6 +142,7 @@ def __init__(self, query, display_progress=True, auth=None):
variable `GITHUB_ACCESS_TOKEN` will be tried.
"""
self.query = query
self.bot_users = set() # Store detected bot usernames

# Authentication
token = auth or os.environ.get("GITHUB_ACCESS_TOKEN")
Expand All @@ -149,7 +152,7 @@ def __init__(self, query, display_progress=True, auth=None):
"--auth flag or must be used to pass a Personal Access Token "
"needed by the GitHub API. You can generate a token at "
"https://github.com/settings/tokens/new. Note that while "
"working with a public repository, you dont need to set any "
"working with a public repository, you don't need to set any "
"scopes on the token you create."
)
self.auth = TokenAuth(token)
Expand Down Expand Up @@ -240,9 +243,7 @@ def request(self, n_pages=100, n_per_page=50):
# Extract bot users from raw data before DataFrame conversion
def is_bot(user_dict):
    """Report whether a GraphQL user object represents a bot account.

    Parameters
    ----------
    user_dict : dict or None
        A user/actor mapping from the GraphQL response. It may be ``None``
        or empty when no account is attached to the item.

    Returns
    -------
    bool
        ``True`` only when ``user_dict`` is non-empty and its
        ``__typename`` field equals ``"Bot"``; ``False`` otherwise.
    """
    # A bare `user_dict and ...` would hand back the falsy input itself
    # (None or {}) instead of False; bool() keeps the result a strict bool.
    return bool(user_dict) and user_dict.get("__typename") == "Bot"

bot_users = set()
for item in self.issues_and_or_prs:
Expand Down Expand Up @@ -272,10 +273,29 @@ def is_bot(user_dict):
if is_bot(comment_author):
bot_users.add(comment_author["login"])

# Check commit authors and committers
commits = item.get("commits")
if commits:
for commit_edge in commits.get("edges", []):
commit = commit_edge["node"]["commit"]
# Check committer
committer = commit.get("committer")
if committer and committer.get("user"):
if is_bot(committer["user"]):
bot_users.add(committer["user"]["login"])
# Check authors
authors = commit.get("authors")
if authors:
for author_edge in authors.get("edges", []):
author_user = author_edge["node"].get("user")
if author_user and is_bot(author_user):
bot_users.add(author_user["login"])

# Create a dataframe of the issues and/or PRs
self.data = pd.DataFrame(self.issues_and_or_prs)
# Store bot users in DataFrame metadata (attrs dict)
# Store bot users in DataFrame attrs and as instance attribute
self.data.attrs["bot_users"] = bot_users
self.bot_users = bot_users

# Add some extra fields
def get_login(user):
Expand Down
27 changes: 27 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,30 @@ def test_contributor_sorting(tmpdir, file_regression):
run(cmd.split(), check=True)
md = path_output.read_text()
file_regression.check(md, extension=".md")


def test_bot_filtering(tmpdir):
    """Check that bot accounts are detected and kept out of the changelog."""
    from github_activity.github_activity import get_activity, generate_activity_md

    # jupyter-book/mystmd 1.6.5 -> 1.6.6 is a small release that is known
    # to include bot activity, so it makes a cheap end-to-end fixture.
    release_window = {
        "target": "jupyter-book/mystmd",
        "since": "mystmd@1.6.5",
        "until": "mystmd@1.6.6",
    }

    # The bot set must still be in attrs after the per-query frames are
    # concatenated (this is the regression the test guards against).
    activity = get_activity(**release_window)
    assert "bot_users" in activity.attrs, "bot_users should be in DataFrame attrs"

    # Render the markdown for the same window; no bot login may leak into it.
    rendered = generate_activity_md(**release_window)
    assert "changeset-bot" not in rendered, (
        "changeset-bot should not appear anywhere in output"
    )