Spaces:

bigscience
/

SourcingCatalog

Runtime error

SourcingCatalog / app.py

Yacine Jernite

narrower

0b3f7a0 over 3 years ago

12.4 kB

	import json

	import streamlit as st
	from datasets import load_dataset
	from streamlit_folium import folium_static

	from catalogue import make_choro_map, region_tree

	##################
	## streamlit
	##################
	# Page-level Streamlit configuration: browser tab title and icon, wide
	# layout, and default sidebar behaviour.
	st.set_page_config(
	    page_title="BigScience Language Resource Catalogue Input Form",
	    page_icon="https://avatars.githubusercontent.com/u/82455566",
	    layout="wide",
	    initial_sidebar_state="auto",
	)

	# URL query parameters of the current request, as {name: [values, ...]}.
	# ``st.experimental_get_query_params`` is deprecated in recent Streamlit
	# releases, so the ``st.query_params`` API is preferred when it exists; the
	# mapping is rebuilt with ``get_all`` to keep the list-valued shape the
	# experimental function returned. Older releases fall back to the original call.
	if hasattr(st, "query_params"):
	    query_params = {
	        name: st.query_params.get_all(name) for name in st.query_params
	    }
	else:
	    query_params = st.experimental_get_query_params()


	def main():
	    """Make sure the session holds a ``save_state`` dict, then draw the page."""
	    session = st.session_state
	    if "save_state" not in session:
	        # First run of this session: start from an empty state.
	        session["save_state"] = {}

	    viz_page()


	##################
	## SECTION: Explore the current catalogue
	##################

	def _load_json_resource(path):
	    """Parse the UTF-8 encoded JSON file at ``path`` and return its content.

	    The file is opened in a ``with`` block so its handle is closed as soon as
	    parsing is done rather than being left open until garbage collection.
	    """
	    with open(path, encoding="utf-8") as resource_file:
	        return json.load(resource_file)


	# Option lists and taxonomies, built once at import time. Most values are read
	# from JSON files under ``resources/`` (paths relative to the working
	# directory). This mapping is what ``viz_page`` hands to
	# ``filter_catalogue_visualization`` as its ``options`` argument.
	app_categories = {
	    # Entry type identifier -> human-readable label.
	    "entry_types": {
	        "primary": "Primary source",
	        "processed": "Processed language dataset",
	        "organization": "Language organization or advocate",
	    },
	    "language_lists": _load_json_resource("resources/language_lists.json"),
	    # Copy of the "itemListElement" collection of the programming languages file.
	    "programming_languages": list(
	        _load_json_resource("resources/programming_languages.json")[
	            "itemListElement"
	        ]
	    ),
	    # Only the BCP-47 subtags whose "type" is "language" are kept.
	    "languages_bcp47": [
	        subtag
	        for subtag in _load_json_resource("resources/bcp47.json")["subtags"]
	        if subtag["type"] == "language"
	    ],
	    "custodian_types": [
	        "A private individual",
	        "A commercial entity",
	        "A library, museum, or archival institute",
	        "A university or research institution",
	        "A nonprofit/NGO (other)",
	        "A government organization",
	    ],
	    "pii_categories": _load_json_resource("resources/pii_categories.json"),
	    "licenses": _load_json_resource("resources/licenses.json"),
	    "primary_taxonomy": _load_json_resource(
	        "resources/primary_source_taxonomy.json"
	    ),
	    "file_formats": _load_json_resource("resources/file_formats.json"),
	}


	def filter_entry(entry, filter_dct):
	    """Tell whether ``entry`` satisfies every constraint in ``filter_dct``.

	    ``filter_dct`` mirrors the (possibly nested) layout of ``entry``. Only keys
	    of ``entry`` that also appear in ``filter_dct`` are checked:

	    * a dict value is checked recursively against ``filter_dct[key]``;
	    * a list value passes when the filter is empty or shares at least one
	      element with it;
	    * any other value passes when the filter is empty or contains it.

	    Keys that exist only in ``filter_dct`` impose no constraint.

	    Returns:
	        ``True`` when all checked fields pass, ``False`` otherwise.
	    """
	    for field, value in entry.items():
	        if field not in filter_dct:
	            continue
	        allowed = filter_dct[field]
	        if isinstance(value, dict):
	            field_ok = filter_entry(value, allowed)
	        elif isinstance(value, list):
	            field_ok = len(allowed) == 0 or any(
	                [item in allowed for item in value]
	            )
	        else:
	            field_ok = len(allowed) == 0 or value in allowed
	        if not field_ok:
	            # One failed field is enough to reject the entry.
	            return False
	    return True


	def filter_catalogue_visualization(catalogue, options):
	    """Render the catalogue filter widgets and return the matching entries.

	    The user first picks which properties to filter on; one or more
	    multiselect widgets are then shown per chosen property. The selections are
	    gathered into a nested dict laid out like a catalogue entry and applied
	    with ``filter_entry``. An empty selection places no constraint on its field.

	    Args:
	        catalogue: iterable of entry mappings; each must provide a ``"uid"`` key.
	        options: mapping of option lists; the keys read here are
	            ``"entry_types"``, ``"language_lists"``, ``"custodian_types"`` and
	            ``"primary_taxonomy"``.

	    Returns:
	        List of the entries accepted by ``filter_entry`` whose ``"uid"`` is
	        not the empty string.
	    """
	    st.markdown("### Select entries to visualize")
	    st.markdown(
	        "##### Select entries by category, language, type of custodian or media"
	    )
	    st.markdown(
	        "You can select specific parts of the catalogue to visualize in this window."
	        + " Leave a field empty to select all values, or select specific options to only select entries that have one of the chosen values."
	    )
	    filter_by_options = [
	        "resource type",
	        "language names",
	        "custodian type",
	        "available for download",
	        "license type",
	        "source type",
	        "media type",
	    ]
	    filter_by = st.multiselect(
	        key="viz_filter_by",
	        label="You can filter the catalogue to only visualize entries that have certain properties, such as:",
	        options=filter_by_options,
	    )
	    filter_dict = {}
	    if "resource type" in filter_by:
	        filter_dict["type"] = st.multiselect(
	            key="viz_filter_type",
	            label="I want to only see entries that are of the following category:",
	            options=options["entry_types"],
	            format_func=lambda x: options["entry_types"][x],
	        )
	    if "language names" in filter_by:
	        filter_dict["languages"] = {}
	        filter_dict["languages"]["language_names"] = st.multiselect(
	            key="viz_filter_languages_language_names",
	            label="I want to only see entries that have one of the following languages:",
	            options=list(options["language_lists"]["language_groups"].keys())
	            + options["language_lists"]["niger_congo_languages"]
	            + options["language_lists"]["indic_languages"],
	        )
	    if "custodian type" in filter_by:
	        filter_dict["custodian"] = {}
	        filter_dict["custodian"]["type"] = st.multiselect(
	            key="viz_filter_custodian_type",
	            label="I want to only see entries that corresponds to organizations or to data that id owned/managed by organizations of the following types:",
	            options=options["custodian_types"],
	        )
	    if "available for download" in filter_by:
	        # "availability" is shared with the license filter below, so an
	        # existing sub-dict is reused rather than overwritten.
	        filter_dict["availability"] = filter_dict.get("availability", {})
	        filter_dict["availability"]["procurement"] = {}
	        download_options = [
	            "No - but the current owners/custodians have contact information for data queries",
	            "No - we would need to spontaneously reach out to the current owners/custodians",
	            "Yes - it has a direct download link or links",
	            "Yes - after signing a user agreement",
	        ]
	        filter_dict["availability"]["procurement"]["for_download"] = st.multiselect(
	            key="viz_availability_procurement_for_download",
	            label="Select based on whether the data can be obtained online:",
	            options=download_options,
	        )
	    if "license type" in filter_by:
	        filter_dict["availability"] = filter_dict.get("availability", {})
	        filter_dict["availability"]["licensing"] = {}
	        filter_dict["availability"]["licensing"]["license_properties"] = st.multiselect(
	            key="viz_availability_licensing_license_properties",
	            label="Select primary entries that have the following license types",
	            options=[
	                "public domain",
	                "multiple licenses",
	                "copyright - all rights reserved",
	                "open license",
	                "research use",
	                "non-commercial use",
	                "do not distribute",
	            ],
	        )
	        primary_license_options = [
	            "Unclear / I don't know",
	            "Yes - the source material has an open license that allows re-use",
	            "Yes - the dataset has the same license as the source material",
	            "Yes - the dataset curators have obtained consent from the source material owners",
	            "No - the license of the source material actually prohibits re-use in this manner",
	        ]
	        # "processed_from_primary" is also filled by the source type filter.
	        filter_dict["processed_from_primary"] = filter_dict.get(
	            "processed_from_primary", {}
	        )
	        filter_dict["processed_from_primary"]["primary_license"] = st.multiselect(
	            key="viz_processed_from_primary_primary_license",
	            label="For datasets, selected based on: Is the license or commercial status of the source material compatible with the license of the dataset?",
	            options=primary_license_options,
	        )
	    if "source type" in filter_by:
	        filter_dict["source_category"] = {}
	        filter_dict["source_category"]["category_type"] = st.multiselect(
	            key="viz_source_category_category_type",
	            label="Select primary sources that correspond to:",
	            options=["collection", "website"],
	        )
	        filter_dict["source_category"]["category_web"] = st.multiselect(
	            key="viz_source_category_category_web",
	            label="Select web-based primary sources that contain:",
	            options=options["primary_taxonomy"]["website"],
	        )
	        filter_dict["source_category"]["category_media"] = st.multiselect(
	            key="viz_source_category_category_media",
	            label="Select primary sources that are collections of:",
	            options=options["primary_taxonomy"]["collection"],
	        )
	        filter_dict["processed_from_primary"] = filter_dict.get(
	            "processed_from_primary", {}
	        )
	        filter_dict["processed_from_primary"]["primary_types"] = st.multiselect(
	            key="viz_processed_from_primary_primary_types",
	            label="Select processed datasets whose primary sources contain:",
	            # Plain "|" separator: the former "\|" was an invalid escape
	            # sequence that put a literal backslash into every web option.
	            options=[f"web | {w}" for w in options["primary_taxonomy"]["website"]]
	            + options["primary_taxonomy"]["collection"],
	        )
	    if "media type" in filter_by:
	        filter_dict["media"] = {}
	        filter_dict["media"]["category"] = st.multiselect(
	            key="viz_media_category",
	            label="Select language data resources that contain:",
	            options=["text", "audiovisual", "image"],
	            help="Media data provided with transcription should go into text, then select the transcribed option. PDFs that have pre-extracted text information should go into text, PDFs that need OCR should go into images, select the latter if you're unsure",
	        )
	    # Keep the entries that pass every active filter and have a non-empty uid.
	    filtered_catalogue = [
	        entry
	        for entry in catalogue
	        if filter_entry(entry, filter_dict) and entry["uid"] != ""
	    ]
	    st.markdown(
	        f"##### Your query matched {len(filtered_catalogue)} entries in the current catalogue."
	    )
	    return filtered_catalogue


	def viz_page():
	    """Draw the catalogue exploration page.

	    Loads the ``bigscience/collaborative_catalog`` dataset (``train`` split),
	    shows the filter widgets and the location-type choice in the sidebar, then
	    renders two expanders: a choropleth map of entry counts per location, and
	    a detail view of one entry chosen among the filtered results.
	    """
	    st.title("🌸 - BigScience Catalog of Language Resources")
	    st.markdown("---\n")
	    catalogue = load_dataset("bigscience/collaborative_catalog")["train"]
	    with st.sidebar:
	        filtered_catalogue = filter_catalogue_visualization(catalogue, app_categories)
	        entry_location_type = st.radio(
	            label="I want to visualize",
	            options=[
	                "Where the organizations or data custodians are located",
	                "Where the language data creators are located",
	            ],
	            key="viz_show_location_type",
	        )
	        # True: count entries by custodian location; False: by the locations
	        # listed under the entry's languages.
	        show_by_org = (
	            entry_location_type
	            == "Where the organizations or data custodians are located"
	        )
	    with st.expander("Map of entries", expanded=True):
	        filtered_counts = {}
	        for entry in filtered_catalogue:
	            locations = (
	                [entry["custodian"]["location"]]
	                if show_by_org
	                else entry["languages"]["language_locations"]
	            )
	            # Be as specific as possible: drop a location when another location
	            # of the same entry is listed under it in ``region_tree``
	            # (presumably a region -> sub-regions mapping; confirm in catalogue).
	            locations = [
	                loc
	                for loc in locations
	                if not any(
	                    other in region_tree.get(loc, []) for other in locations
	                )
	            ]
	            for loc in locations:
	                filtered_counts[loc] = filtered_counts.get(loc, 0) + 1
	        world_map = make_choro_map(filtered_counts)
	        folium_static(world_map, width=900, height=600)
	    with st.expander("View selected resources", expanded=False):
	        st.write("You can further select locations to select entries from here:")
	        # Every distinct location found in the filtered entries, sorted.
	        filter_region_choices = sorted(
	            {
	                loc
	                for entry in filtered_catalogue
	                for loc in (
	                    [entry["custodian"]["location"]]
	                    if show_by_org
	                    else entry["languages"]["language_locations"]
	                )
	            }
	        )
	        filter_locs = st.multiselect(
	            "View entries from the following locations:",
	            options=filter_region_choices,
	            key="viz_select_location",
	        )
	        filter_loc_dict = (
	            {"custodian": {"location": filter_locs}}
	            if show_by_org
	            else {"languages": {"language_locations": filter_locs}}
	        )
	        filtered_catalogue_by_loc = [
	            entry
	            for entry in filtered_catalogue
	            if filter_entry(entry, filter_loc_dict)
	        ]
	        view_entry = st.selectbox(
	            label="Select an entry to see more detail:",
	            options=filtered_catalogue_by_loc,
	            # Plain "|" separator: the former "\|" was an invalid escape
	            # sequence that showed a literal backslash in every label.
	            format_func=lambda entry: f"{entry['uid']} | {entry['description']['name']} -- {entry['description']['description']}",
	            key="viz_select_entry",
	        )
	        if view_entry is None:
	            # The selectbox yields None when no entry matches the filters;
	            # indexing it below would raise a TypeError.
	            st.write("No entry matches the current selection.")
	        else:
	            st.markdown(
	                f"##### Type: {view_entry['type']} UID: {view_entry['uid']} - Name: {view_entry['description']['name']}\n\n{view_entry['description']['description']}"
	            )
	            st.write(view_entry)


	# Build the page only when this file is executed as the main script.
	if __name__ == "__main__":
	    main()