Spaces:
Sleeping
Sleeping
"""
This application enables exploration with data from the paper:

4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware
https://arxiv.org/abs/2412.13459

Requires the following packages.

pip install streamlit
"""
import os

import pandas as pd
import streamlit as st
class Application:
    """
    Main application. Loads the StarScout fake-star dataset once and renders
    per-repo exploration views.
    """

    def __init__(self):
        """
        Creates a new application.
        """

        # Load data from GitHub project
        self.data = self.load()

    def load(self):
        """
        Loads and joins the clustered and low-activity fake star datasets
        from the source GitHub project.

        Returns:
            dataframe with one row per (repo, month)
        """

        # Dataset version published by the StarScout project
        version = "241001"
        base = f"https://github.com/hehao98/StarScout/raw/refs/heads/main/data/{version}"

        # Read data
        clustered = pd.read_csv(f"{base}/fake_stars_clustered_stars_by_month.csv")
        activity = pd.read_csv(f"{base}/fake_stars_low_activity_stars_by_month.csv")

        # Outer join so months flagged by only one detector are kept
        data = pd.merge(clustered, activity, how="outer", on=["repo", "month"])

        # Consolidate the duplicated total-stars column produced by the merge
        data["n_stars"] = pd.to_numeric(data[["n_stars_x", "n_stars_y"]].max(axis=1), downcast="integer")
        data = data.drop(["n_stars_x", "n_stars_y"], axis=1)

        # Aggregate fake star counts - a missing value means zero flagged stars that month
        data["n_stars_clustered"] = pd.to_numeric(data["n_stars_clustered"].fillna(0), downcast="integer")
        data["n_stars_low_activity"] = pd.to_numeric(data["n_stars_low_activity"].fillna(0), downcast="integer")
        data["n_stars_flagged"] = data["n_stars_clustered"] + data["n_stars_low_activity"]

        # A star can be flagged by both detectors - cap the flagged count at the total
        data["n_stars_flagged"] = pd.to_numeric(data[["n_stars", "n_stars_flagged"]].min(axis=1), downcast="integer")

        # Calculate stat columns
        data["n_flagged_percent"] = 100 * (data["n_stars_flagged"] / data["n_stars"])

        # Rename by name (not by position) so an upstream column-order change
        # cannot silently mislabel the output
        data = data.rename(columns={
            "n_stars_clustered": "clustered",
            "n_stars_low_activity": "low activity",
            "n_stars": "total stars",
            "n_stars_flagged": "flagged stars",
            "n_flagged_percent": "flagged %",
        })

        return data[["repo", "month", "clustered", "low activity", "flagged stars", "total stars", "flagged %"]]

    def run(self):
        """
        Main rendering logic.
        """

        # List of GitHub repos
        repos = st.text_area("**GitHub Repos, one per line**")

        # Format input
        repos = self.parse(repos)
        if repos:
            # Get top flagged month per project
            frames = [
                self.data[self.data["repo"].str.lower() == repo.lower()].sort_values("flagged stars", ascending=False)[:1]
                for repo in repos
            ]

            # Aggregate into single data frame and display
            aggregate = pd.concat(frames, axis=0)
            aggregate = aggregate.sort_values("flagged %", ascending=False).reset_index(drop=True)

            st.markdown("**Top month flagged by project**")
            st.dataframe(
                data=aggregate,
                column_config={
                    "flagged %": st.column_config.NumberColumn(
                        format="%.2f %%"
                    )
                },
                use_container_width=True
            )

            # Per-repo monthly star trend charts
            for repo in aggregate["repo"]:
                st.markdown(f"**{repo}**")
                st.line_chart(
                    data=self.data[self.data["repo"].str.lower() == repo.lower()].sort_values("month"),
                    x="month",
                    y=["total stars", "flagged stars"],
                    color=["#F44336", "#2196F3"],
                )

    def parse(self, repos):
        """
        Parses and cleans the input repos string.

        Args:
            repos: raw multi-line text-area input

        Returns:
            list of "owner/name" repo identifiers
        """

        outputs = []
        for repo in repos.splitlines():
            # Strip whitespace (handles "\r" from browser line endings), accept
            # full GitHub URLs and drop any trailing slash
            repo = repo.strip().replace("https://github.com/", "").strip("/")
            if repo:
                outputs.append(repo)

        return outputs
@st.cache_resource
def create():
    """
    Creates and caches a Streamlit application.

    Cached so the dataset download in Application.load only runs once per
    server process instead of on every Streamlit rerun.

    Returns:
        Application
    """

    return Application()
if __name__ == "__main__":
    # Silence tokenizers fork warnings in the hosting environment
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Page setup
    st.set_page_config(
        page_title="4.5 Million (Suspected) Fake Stars in GitHub",
        page_icon="⭐",
        layout="centered",
        initial_sidebar_state="auto",
        menu_items=None,
    )
    st.markdown("## 4.5 Million (Suspected) Fake ⭐'s in GitHub")

    # Introduction and dataset disclaimers
    about = """
    This application explores the data provided by the paper titled:

    _4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware_

    _[Paper](https://arxiv.org/abs/2412.13459) | [GitHub Project](https://github.com/hehao98/StarScout)_

    Note the disclaimer from the paper's authors.

    **Disclaimer**. _As we discussed in Section 3.4 and 3.5 in our paper, the resulting dataset are only repositories and users with suspected
    fake stars. The individual repositories and users in our dataset may be false positives. The main purpose of our dataset is for statistical
    analyses (which tolerates noises reasonably well), not for publicly shaming individual repositories. If you intend to publish subsequent work
    based on our dataset, please be aware of this limitation and its ethical implications._

    To add to the authors disclaimer.

    _It's also worth noting that projects that trend on popular sites such as the GitHub Trending Page can attract a lot of automated behavior outside
    of a project's control. This dataset is just a data point that shouldn't be used in a vacuum._
    """
    st.markdown(about)

    # Create and run application
    app = create()
    app.run()