Spaces:
Runtime error
Runtime error
| import json | |
| import streamlit as st | |
| from datasets import load_dataset | |
| from streamlit_folium import folium_static | |
| from catalogue import make_choro_map, region_tree | |
| ################## | |
| ## streamlit | |
| ################## | |
# Page-level Streamlit settings: browser tab title, icon, wide layout and
# automatic sidebar state.
st.set_page_config(
    page_title="BigScience Language Resource Catalogue Input Form",
    page_icon="https://avatars.githubusercontent.com/u/82455566",
    layout="wide",
    initial_sidebar_state="auto",
)
# URL query parameters captured when the module body runs.
# NOTE(review): nothing in the visible part of this file reads `query_params`,
# and the call relies on an `experimental_` Streamlit API - confirm it is
# still needed and still available in the deployed Streamlit version.
query_params = st.experimental_get_query_params()
def main():
    """Make sure the session holds a ``save_state`` dict, then render the page."""
    session = st.session_state
    if "save_state" not in session:
        session["save_state"] = {}
    viz_page()
| ################## | |
| ## SECTION: Explore the current catalogue | |
| ################## | |
def _load_json_resource(path):
    """Parse the UTF-8 JSON file at *path* and return its content.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of being left for the garbage collector.
    """
    with open(path, encoding="utf-8") as resource_file:
        return json.load(resource_file)


# Option lists used to populate the app's widgets. Hard-coded values live
# here; the rest is read from the JSON files under ``resources/`` when the
# module is loaded.
app_categories = {
    "entry_types": {
        "primary": "Primary source",
        "processed": "Processed language dataset",
        "organization": "Language organization or advocate",
    },
    "language_lists": _load_json_resource("resources/language_lists.json"),
    "programming_languages": list(
        _load_json_resource("resources/programming_languages.json")[
            "itemListElement"
        ]
    ),
    # Keep only the subtags whose "type" field is "language".
    "languages_bcp47": [
        subtag
        for subtag in _load_json_resource("resources/bcp47.json")["subtags"]
        if subtag["type"] == "language"
    ],
    "custodian_types": [
        "A private individual",
        "A commercial entity",
        "A library, museum, or archival institute",
        "A university or research institution",
        "A nonprofit/NGO (other)",
        "A government organization",
    ],
    "pii_categories": _load_json_resource("resources/pii_categories.json"),
    "licenses": _load_json_resource("resources/licenses.json"),
    "primary_taxonomy": _load_json_resource(
        "resources/primary_source_taxonomy.json"
    ),
    "file_formats": _load_json_resource("resources/file_formats.json"),
}
def filter_entry(entry, filter_dct):
    """Tell whether *entry* satisfies every constraint in *filter_dct*.

    Only keys present in both mappings are checked:

    * a dict value is checked recursively against ``filter_dct[key]``;
    * a list value passes when the allowed collection is empty or shares at
      least one element with the list;
    * any other value passes when the allowed collection is empty or
      contains the value.

    Returns ``True`` when all checked keys pass (including when there is
    nothing to check), ``False`` otherwise.
    """
    for field, value in entry.items():
        if field not in filter_dct:
            continue
        allowed = filter_dct[field]
        if isinstance(value, dict):
            accepted = filter_entry(value, allowed)
        elif isinstance(value, list):
            accepted = len(allowed) == 0 or any(item in allowed for item in value)
        else:
            accepted = len(allowed) == 0 or value in allowed
        if not accepted:
            # One failed constraint is enough to reject the entry.
            return False
    return True
def filter_catalogue_visualization(catalogue, options):
    """Render the catalogue filter widgets and return the matching entries.

    Args:
        catalogue: iterable of catalogue entries; each entry is passed to
            ``filter_entry`` and indexed with ``entry["uid"]``.
        options: mapping of option lists; the ``entry_types``,
            ``language_lists``, ``custodian_types`` and ``primary_taxonomy``
            keys are read here.

    Returns:
        The list of entries accepted by ``filter_entry`` for the chosen
        filters whose ``uid`` is not the empty string.
    """
    st.markdown("### Select entries to visualize")
    st.markdown(
        "##### Select entries by category, language, type of custodian or media"
    )
    st.markdown(
        "You can select specific parts of the catalogue to visualize in this window."
        " Leave a field empty to select all values, or select specific options to only select entries that have one of the chosen values."
    )

    # First pick which properties to filter on; one group of widgets is then
    # shown per selected property.
    filter_by = st.multiselect(
        key="viz_filter_by",
        label="You can filter the catalogue to only visualize entries that have certain properties, such as:",
        options=[
            "resource type",
            "language names",
            "custodian type",
            "available for download",
            "license type",
            "source type",
            "media type",
        ],
    )

    # Nested filter specification mirroring the layout of a catalogue entry.
    criteria = {}

    if "resource type" in filter_by:
        entry_types = options["entry_types"]
        criteria["type"] = st.multiselect(
            key="viz_filter_type",
            label="I want to only see entries that are of the following category:",
            options=entry_types,
            format_func=lambda type_key: entry_types[type_key],
        )

    if "language names" in filter_by:
        language_lists = options["language_lists"]
        language_choices = (
            list(language_lists["language_groups"].keys())
            + language_lists["niger_congo_languages"]
            + language_lists["indic_languages"]
        )
        criteria["languages"] = {
            "language_names": st.multiselect(
                key="viz_filter_languages_language_names",
                label="I want to only see entries that have one of the following languages:",
                options=language_choices,
            )
        }

    if "custodian type" in filter_by:
        criteria["custodian"] = {
            "type": st.multiselect(
                key="viz_filter_custodian_type",
                label="I want to only see entries that corresponds to organizations or to data that id owned/managed by organizations of the following types:",
                options=options["custodian_types"],
            )
        }

    if "available for download" in filter_by:
        # "availability" is shared with the license filter below, so reuse
        # the sub-dict if it already exists.
        availability = criteria.setdefault("availability", {})
        availability["procurement"] = {
            "for_download": st.multiselect(
                key="viz_availability_procurement_for_download",
                label="Select based on whether the data can be obtained online:",
                options=[
                    "No - but the current owners/custodians have contact information for data queries",
                    "No - we would need to spontaneously reach out to the current owners/custodians",
                    "Yes - it has a direct download link or links",
                    "Yes - after signing a user agreement",
                ],
            )
        }

    if "license type" in filter_by:
        availability = criteria.setdefault("availability", {})
        availability["licensing"] = {
            "license_properties": st.multiselect(
                key="viz_availability_licensing_license_properties",
                label="Select primary entries that have the following license types",
                options=[
                    "public domain",
                    "multiple licenses",
                    "copyright - all rights reserved",
                    "open license",
                    "research use",
                    "non-commercial use",
                    "do not distribute",
                ],
            )
        }
        # "processed_from_primary" is shared with the source type filter.
        from_primary = criteria.setdefault("processed_from_primary", {})
        from_primary["primary_license"] = st.multiselect(
            key="viz_processed_from_primary_primary_license",
            label="For datasets, selected based on: Is the license or commercial status of the source material compatible with the license of the dataset?",
            options=[
                "Unclear / I don't know",
                "Yes - the source material has an open license that allows re-use",
                "Yes - the dataset has the same license as the source material",
                "Yes - the dataset curators have obtained consent from the source material owners",
                "No - the license of the source material actually prohibits re-use in this manner",
            ],
        )

    if "source type" in filter_by:
        # Dict values are evaluated in order, so the widgets are rendered in
        # the order they are written here.
        criteria["source_category"] = {
            "category_type": st.multiselect(
                key="viz_source_category_category_type",
                label="Select primary sources that correspond to:",
                options=["collection", "website"],
            ),
            "category_web": st.multiselect(
                key="viz_source_category_category_web",
                label="Select web-based primary sources that contain:",
                options=options["primary_taxonomy"]["website"],
            ),
            "category_media": st.multiselect(
                key="viz_source_category_category_media",
                label="Select primary sources that are collections of:",
                options=options["primary_taxonomy"]["collection"],
            ),
        }
        from_primary = criteria.setdefault("processed_from_primary", {})
        web_choices = [
            f"web | {web_type}" for web_type in options["primary_taxonomy"]["website"]
        ]
        from_primary["primary_types"] = st.multiselect(
            key="viz_processed_from_primary_primary_types",
            label="Select processed datasets whose primary sources contain:",
            options=web_choices + options["primary_taxonomy"]["collection"],
        )

    if "media type" in filter_by:
        criteria["media"] = {
            "category": st.multiselect(
                key="viz_media_category",
                label="Select language data resources that contain:",
                options=["text", "audiovisual", "image"],
                help="Media data provided with transcription should go into **text**, then select the *transcribed* option. PDFs that have pre-extracted text information should go into **text**, PDFs that need OCR should go into **images**, select the latter if you're unsure",
            )
        }

    # Keep entries that satisfy the filters and have a non-empty uid.
    matching_entries = []
    for entry in catalogue:
        if not filter_entry(entry, criteria):
            continue
        if entry["uid"] == "":
            continue
        matching_entries.append(entry)

    st.markdown(
        f"##### Your query matched **{len(matching_entries)}** entries in the current catalogue."
    )
    return matching_entries
def _entry_locations(entry, show_by_org):
    """Return the entry's custodian location (as a one-item list) or its language locations."""
    if show_by_org:
        return [entry["custodian"]["location"]]
    return entry["languages"]["language_locations"]


def viz_page():
    """Render the catalogue exploration page.

    Loads the ``bigscience/collaborative_catalog`` dataset, shows the filter
    widgets in the sidebar, draws a choropleth map of entry counts per
    location, and lets the user narrow the entries by location and inspect
    one of them in detail.
    """
    st.title("🌸 - BigScience Catalog of Language Resources")
    st.markdown("---\n")
    catalogue = load_dataset("bigscience/collaborative_catalog")["train"]

    with st.sidebar:
        filtered_catalogue = filter_catalogue_visualization(catalogue, app_categories)
        entry_location_type = st.radio(
            label="I want to visualize",
            options=[
                "Where the organizations or data custodians are located",
                "Where the language data creators are located",
            ],
            key="viz_show_location_type",
        )
        show_by_org = (
            entry_location_type
            == "Where the organizations or data custodians are located"
        )

    with st.expander("Map of entries", expanded=True):
        filtered_counts = {}
        for entry in filtered_catalogue:
            locations = _entry_locations(entry, show_by_org)
            # Be as specific as possible: drop a location when its
            # region_tree entry contains another location of the same entry
            # (presumably one of its sub-regions - confirm against
            # catalogue.region_tree).
            locations = [
                loc
                for loc in locations
                if not any(other in region_tree.get(loc, []) for other in locations)
            ]
            for loc in locations:
                filtered_counts[loc] = filtered_counts.get(loc, 0) + 1
        world_map = make_choro_map(filtered_counts)
        folium_static(world_map, width=900, height=600)

    with st.expander("View selected resources", expanded=False):
        st.write("You can further select locations to select entries from here:")
        filter_region_choices = sorted(
            {
                loc
                for entry in filtered_catalogue
                for loc in _entry_locations(entry, show_by_org)
            }
        )
        filter_locs = st.multiselect(
            "View entries from the following locations:",
            options=filter_region_choices,
            key="viz_select_location",
        )
        # An empty selection leaves the location unconstrained in filter_entry.
        filter_loc_dict = (
            {"custodian": {"location": filter_locs}}
            if show_by_org
            else {"languages": {"language_locations": filter_locs}}
        )
        filtered_catalogue_by_loc = [
            entry
            for entry in filtered_catalogue
            if filter_entry(entry, filter_loc_dict)
        ]
        view_entry = st.selectbox(
            label="Select an entry to see more detail:",
            options=filtered_catalogue_by_loc,
            format_func=lambda entry: f"{entry['uid']} | {entry['description']['name']} -- {entry['description']['description']}",
            key="viz_select_entry",
        )
        if view_entry is None:
            # st.selectbox returns None when there is nothing to select;
            # subscripting it would raise a TypeError and break the page.
            st.info("No entry matches the current selection.")
        else:
            st.markdown(
                f"##### *Type:* {view_entry['type']} *UID:* {view_entry['uid']} - *Name:* {view_entry['description']['name']}\n\n{view_entry['description']['description']}"
            )
            st.write(view_entry)
# Run the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()