From ec4cda7a97b7e6c43271a5d830de0a2cc364108c Mon Sep 17 00:00:00 2001 From: Matthew Feickert Date: Fri, 15 Mar 2024 19:33:12 -0500 Subject: [PATCH] feat: Add jupytext support for notebooks (#7) * Add jupytext to requirements and rebuild lock files. * Add jupytext .ipynb to .py pairing rules to pyproject.toml. * Convert .ipynb to .py:percent with jupytext. * Ignore .ipynb files to make versioning easier. * Apply ruff formatting. * Add section on using jupytext to README. --- .gitignore | 3 +- README.md | 7 + notebooks/Pre-processing.ipynb | 728 -------------------------------- notebooks/Run-assignments.ipynb | 620 --------------------------- notebooks/pre-processing.py | 415 ++++++++++++++++++ notebooks/run-assignments.py | 331 +++++++++++++++ pyproject.toml | 5 + requirements.lock | 27 +- requirements.txt | 1 + 9 files changed, 787 insertions(+), 1350 deletions(-) delete mode 100644 notebooks/Pre-processing.ipynb delete mode 100644 notebooks/Run-assignments.ipynb create mode 100644 notebooks/pre-processing.py create mode 100644 notebooks/run-assignments.py diff --git a/.gitignore b/.gitignore index 8599432..0a6fdcf 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ __pycache__ *.json venv/ *ipynb_checkpoints -.DS_Store \ No newline at end of file +.DS_Store +*.ipynb diff --git a/README.md b/README.md index 531ed08..b1c852c 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,13 @@ When you're ready to run tests, run: python3 -m pytest ``` +## Using `jupytext` with Jupyter notebooks + +[`jupytext`](https://jupytext.readthedocs.io/) allows for easier versioning of Jupyter notebooks by saving all of the information that exists in them in [specially formatted](https://jupytext.readthedocs.io/en/latest/formats-scripts.html#the-percent-format) `.py` files and then generating the notebook representation when you [select them in a Jupyter interface](https://jupytext.readthedocs.io/en/latest/text-notebooks.html#how-to-open-a-text-notebook-in-jupyter-lab). +Version the `.py` files as you normally would with any other text file. +To run the `.py` files as Jupyter notebooks, select them in the Jupyter file browser, right click, and then select _Open With → Notebook_. +Any changes made in a Jupyter notebook will be automatically synced to the [paired](https://jupytext.readthedocs.io/en/latest/paired-notebooks.html) `.py` file. + # File breakdown Here's a short explanation of each file/folder in this template: diff --git a/notebooks/Pre-processing.ipynb b/notebooks/Pre-processing.ipynb deleted file mode 100644 index 91269fc..0000000 --- a/notebooks/Pre-processing.ipynb +++ /dev/null @@ -1,728 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "2f0334ca", - "metadata": {}, - "outputs": [], - "source": [ - "import duckdb\n", - "\n", - "# Raw data to import\n", - "raw_files = dict(\n", - " scipy_reviewers = \"../data/scipy_reviewers.csv\", # people who signed up as reviewers\n", - " pretalx_sessions = \"../data/sessions.csv\", # all proposal exported from pretalx\n", - " pretalx_speakers = \"../data/speakers.csv\", # all speakers exported from pretalx\n", - " pretalx_reviewers = \"../data/pretalx_reviewers.csv\", # all reviewers copy-pasted from pretalx\n", - " coi_reviewers = \"../data/scipy_coi_export.csv\", # all responses to the coi form\n", - " coi_authors = \"../data/coi_authors.csv\", # copy pasted values of author names from coi form\n", - " tracks = \"../data/tracks.csv\" # manually entered track IDs\n", - ")\n", - "\n", - "# Output\n", - "database_file = \"../data/assign_reviews.db\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03d0f519", - "metadata": {}, - "outputs": [], - "source": [ - "con = duckdb.connect(database_file)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ce918a8-7441-4e64-bcf6-a3e6e8f864aa", - "metadata": {}, - "outputs": [], - "source": [ - "def create_and_show_table(file_name, table_name, show=True):\n", - " con.sql(f\"create or replace table {table_name} as select * from read_csv(\\\"{file_name}\\\", header=true)\")\n", - " if show is True:\n", - " return con.sql(f\"table {table_name}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd7267b9-c1f5-45c1-996b-af93c7b02f08", - "metadata": {}, - "outputs": [], - "source": [ - "from IPython import display\n", - "for table_name, file_name in raw_files.items():\n", - " print(table_name)\n", - " display.display(create_and_show_table(file_name, table_name).df())\n", - " print(\"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7570eed-6187-4b40-ac8c-ed25e4beb8cb", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(f\"\"\"\n", - "table tracks\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2924fa43-915c-4c63-b715-08e2075983e4", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(f\"\"\"\n", - "with dupes as\n", - " (\n", - " select\n", - " name,\n", - " num,\n", - " email\n", - " from\n", - " (\n", - " select\n", - " name,\n", - " count(*) as num,\n", - " string_agg(Email) as email\n", - " from\n", - " scipy_reviewers\n", - " group by Name\n", - " )\n", - " where\n", - " num>1\n", - " )\n", - "\n", - "select * from dupes\n", - "\"\"\").df()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04ea463b-4151-4dfe-ae2d-08b1c763e9a0", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(f\"\"\"\n", - "select count(*) from scipy_reviewers\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd8ddb47-daf1-4ac7-abfa-39f7d839be13", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(f\"\"\"\n", - "select count(*) from pretalx_reviewers\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11439a50-cfbb-403b-a0fa-cf4b91f31e11", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(f\"\"\"\n", - "select count(*) from coi_reviewers\n", - "\"\"\")" - ] - }, - { - "cell_type": "markdown", - "id": "55e81f6d-c8c9-4cfc-9770-db731068c55f", - "metadata": {}, - "source": [ - "This is a table with all reviewers who\n", - "1. signed up\n", - "2. created an account on pretalx\n", - "3. submitted the COI form" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b9ecee5-ae65-4c49-8e6c-4ebcd6a796f2", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(f\"\"\"\n", - "create or replace table reviewers as\n", - " select\n", - " scipy_reviewers.Name as name,\n", - " scipy_reviewers.Email as email,\n", - " \\\"Track(s) to review for (check all that apply)\\\" as tracks,\n", - " \\\"Mark the speaker(s) or company/organization/affiliation(s) that could pose a conflict of interest\\\" as coi\n", - " from scipy_reviewers\n", - " join pretalx_reviewers on scipy_reviewers.Email = pretalx_reviewers.Email\n", - " join coi_reviewers on coi_reviewers.Email = pretalx_reviewers.Email\n", - "\"\"\")\n", - "\n", - "df = con.sql(\"select distinct * from reviewers\").df()\n", - "num_reviewers = len(df)\n", - "df" - ] - }, - { - "cell_type": "markdown", - "id": "461de254-93e5-4a97-b7d5-3074df212fdb", - "metadata": {}, - "source": [ - "Reviewers who signed up for pretalx but did not fill in COI" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4221ff7c-c824-4517-820c-1f8323d95970", - "metadata": {}, - "outputs": [], - "source": [ - "con = duckdb.connect(database_file)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b70bc5b8-118a-43a4-9b07-2d6dbc59eb1a", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "df = con.sql(\"select * from pretalx_reviewers anti join coi_reviewers on pretalx_reviewers.Email = coi_reviewers.Email\").df()\n", - "num_pretalx_no_coi = len(df)\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64d329e5-88f2-4107-a33f-59bdf1a98c8d", - "metadata": {}, - "outputs": [], - "source": [ - "# df.to_csv(\"input/signed_up_for_pretalx_no_coi.csv\")" - ] - }, - { - "cell_type": "markdown", - "id": "5c475b2b-ac98-495d-94e8-5135366523e2", - "metadata": {}, - "source": [ - "Reviewers who filled in COI but did not sign up for pretalx" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "868dcd1e-8f4a-4e01-8ad2-0538d8b4557d", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "df = con.sql(\"select * from coi_reviewers anti join pretalx_reviewers on coi_reviewers.Email = pretalx_reviewers.Email\").df()\n", - "num_coi_no_pretalx = len(df)\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1b6f1c8-e00f-4e79-ad6c-f509591a362a", - "metadata": {}, - "outputs": [], - "source": [ - "# df.to_csv(\"input/submitted_coi_no_pretalx.csv\")" - ] - }, - { - "cell_type": "markdown", - "id": "c493bc49-adbc-4173-a9c0-38178c7e8135", - "metadata": {}, - "source": [ - "People who signed up as reviewer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "edf7bf56-7884-49fa-811c-0e5b3c3c5e92", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "df = con.sql(\"\"\"\n", - "select distinct * from scipy_reviewers\n", - "\"\"\").df()\n", - "num_signed_up = len(df)\n", - "df" - ] - }, - { - "cell_type": "markdown", - "id": "0025d41a-ecf6-4c08-985f-3e30807675e3", - "metadata": {}, - "source": [ - "People who signed up as reviewer and signed up for pretalx and submitted COI but used different email addresses" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4feef06c-985a-485c-b4f1-73d87c2f8f1c", - "metadata": {}, - "outputs": [], - "source": [ - "df = con.sql(\"\"\"\n", - "create or replace table reviewers_with_email_typos as\n", - "(with no_coi as\n", - "(select * from pretalx_reviewers anti join coi_reviewers on pretalx_reviewers.Email = coi_reviewers.Email),\n", - "no_pretalx as\n", - "(select * from coi_reviewers anti join pretalx_reviewers on coi_reviewers.Email = pretalx_reviewers.Email)\n", - "select distinct scipy_reviewers.Name, scipy_reviewers.Email, no_pretalx.Email as no_pretalx_email, no_coi.email as no_coi_email from scipy_reviewers\n", - "join no_coi on no_coi.Name = scipy_reviewers.Name\n", - "join no_pretalx on no_pretalx.Name = no_coi.Name)\n", - "\"\"\")\n", - "df = con.sql(\"table reviewers_with_email_typos\").df()\n", - "num_typos = len(df)\n", - "df" - ] - }, - { - "cell_type": "markdown", - "id": "5a2c83c8-29e2-412e-b84e-ab4944e650c4", - "metadata": {}, - "source": [ - "People who signed up as reviewer and signed up for pretalx and submitted COI but used different names" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cf854063-4a88-4be2-8cd9-04fd78f1cf9f", - "metadata": {}, - "outputs": [], - "source": [ - "df = con.sql(\"\"\"\n", - "(with no_coi as\n", - "(select * from pretalx_reviewers anti join coi_reviewers on pretalx_reviewers.Email = coi_reviewers.Email),\n", - "no_pretalx as\n", - "(select * from coi_reviewers anti join pretalx_reviewers on coi_reviewers.Email = pretalx_reviewers.Email)\n", - "select distinct scipy_reviewers.Name, scipy_reviewers.Email, no_pretalx.Name as no_pretalx_name, no_coi.name as no_coi_name from scipy_reviewers\n", - "join no_coi on no_coi.Email = scipy_reviewers.Email\n", - "join no_pretalx on no_pretalx.Email = no_coi.Email)\n", - "\"\"\").df()\n", - "num_typos_name = len(df)\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb7b392a", - "metadata": {}, - "outputs": [], - "source": [ - "# df.to_csv(\"input/reviewers_multi_email.csv\")" - ] - }, - { - "cell_type": "markdown", - "id": "4e7c6208-0e73-4447-b749-578b7f84f5a9", - "metadata": {}, - "source": [ - "People who signed up as reviewer and didn't sign up for pretalx nor submitted COI" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7bf3576-78e6-4367-abc0-f48e250807e4", - "metadata": {}, - "outputs": [], - "source": [ - "df = con.sql(\"\"\"\n", - "(with no_coi as\n", - "(select * from pretalx_reviewers anti join coi_reviewers on pretalx_reviewers.Email = coi_reviewers.Email),\n", - "no_pretalx as\n", - "(select * from coi_reviewers anti join pretalx_reviewers on coi_reviewers.Email = pretalx_reviewers.Email)\n", - "select distinct scipy_reviewers.Name, scipy_reviewers.Email from scipy_reviewers\n", - "anti join reviewers on reviewers.Name = scipy_reviewers.Name\n", - "anti join no_coi on no_coi.Name = scipy_reviewers.Name\n", - "anti join no_pretalx on no_pretalx.Name = scipy_reviewers.Name)\n", - "\"\"\").df()\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29faf2df-6c56-447a-9303-30e297ad4ca7", - "metadata": {}, - "outputs": [], - "source": [ - "df = con.sql(\"\"\"\n", - "select distinct * from scipy_reviewers\n", - "anti join reviewers on scipy_reviewers.Email = reviewers.email\n", - "\"\"\").df()\n", - "num_no_show = len(df)\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ad4b6bb5-2698-4246-afd0-2003c9ac6bda", - "metadata": {}, - "outputs": [], - "source": [ - "# df.to_csv(\"input/all_reviewers_without_assignments.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4e2ebd4-1c7c-47b2-88a5-f12f52e3b132", - "metadata": {}, - "outputs": [], - "source": [ - "num_no_show = num_signed_up-num_reviewers-num_pretalx_no_coi-num_coi_no_pretalx\n", - "num_partial = sum([num_pretalx_no_coi, num_coi_no_pretalx, num_no_show])\n", - "num_reviewers, num_signed_up, num_pretalx_no_coi, num_coi_no_pretalx, num_no_show, num_partial" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af12d40a-1f28-4154-9ded-1f358d1d3e06", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"select * from reviewers where instr(name, 'eli')\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "223b6992-1f77-44f7-83df-8fae78b30d29", - "metadata": {}, - "outputs": [], - "source": [ - "# con.sql(\"table reviewers\").df().to_csv(\"input/reviewers_to_assign_with_name.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dbb586ee-df00-4fb9-9d16-e3e5d570c3de", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"select * from reviewers where instr(Name, 'Wu')\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d82591ec-8532-4b0b-bcf5-ebbc7275c266", - "metadata": {}, - "outputs": [], - "source": [ - "sum([num_pretalx_no_coi, num_coi_no_pretalx, num_reviewers])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e1b3cbed-547b-49b8-80e6-4ab0df57b08f", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(f\"\"\"\n", - "with dupes as\n", - " (\n", - " select\n", - " *\n", - " from\n", - " (\n", - " select\n", - " name,\n", - " count(*) as num,\n", - " string_agg(email) as email,\n", - " string_agg(tracks) as tracks,\n", - " string_agg(coi) as coi\n", - " from\n", - " reviewers\n", - " group by name\n", - " )\n", - " where\n", - " num>1\n", - " )\n", - "\n", - "select * from dupes\n", - "\"\"\").df().T.to_json()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa1a29e2-b27c-4ed4-a075-4c726e6dda48", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"create or replace table reviewers as (select distinct * from reviewers)\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a548f2e-6b79-4c28-8b57-1efae53694ec", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(f\"\"\"\n", - "create or replace table reviewers_with_tracks as\n", - "with reviewers_no_dupes as (select distinct * from reviewers)\n", - "select reviewers_no_dupes.name, email, list(tracks.name) as tracks, list(tracks.track_id) as track_ids from reviewers_no_dupes\n", - " join tracks on instr(reviewers_no_dupes.tracks, tracks.name)\n", - " group by reviewers_no_dupes.name, email\n", - "\"\"\"\n", - ")\n", - "\n", - "con.sql(\"select distinct * from reviewers_with_tracks\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3427dba1-d840-46a2-a304-7a643c1aeee0", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"select ID as submission_id, \\\"Speaker IDs\\\" as speaker_ids from pretalx_sessions\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0657b537-bd53-426a-9345-ceca87f36a73", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(f\"\"\"\n", - "create or replace table reviewers_with_coi as\n", - "\n", - "with submissions_with_authors as (\n", - " select\n", - " ID as submission_id,\n", - " \\\"Speaker IDs\\\" as speaker_ids\n", - " from\n", - " pretalx_sessions\n", - ")\n", - "select\n", - " reviewers.name,\n", - " reviewers.email,\n", - " list(pretalx_speakers.Name) as speakers,\n", - " list(pretalx_speakers.ID) AS speaker_ids,\n", - " list(submissions_with_authors.submission_id) as submission_ids\n", - "from\n", - " reviewers\n", - " left join coi_authors on instr(coi, coi_authors.author)\n", - " left join pretalx_speakers on contains(coi_authors.author, pretalx_speakers.Name)\n", - " left join submissions_with_authors on contains(submissions_with_authors.speaker_ids, pretalx_speakers.ID)\n", - "group by reviewers.name, reviewers.email\n", - "order by reviewers.name\n", - "\"\"\"\n", - ")\n", - "\n", - "con.sql(\"table reviewers_with_coi\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ffef4114-385b-44b0-9bc1-f21a8a1402ed", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"\"\"\n", - "with reviewers_with_coi_pre as (\n", - " select name, email, author\n", - " from reviewers\n", - " join coi_authors on instr(coi, coi_authors.author)\n", - ")\n", - "select count(*), author from reviewers_with_coi_pre anti join pretalx_speakers on contains(reviewers_with_coi_pre.author, pretalx_speakers.Name) group by author\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "acdd01c4-eb7b-4f80-a8dc-462a9bc5508b", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"table reviewers_with_tracks\").df()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90185548-bfdd-47d1-bdac-d95af43b8c0f", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"select email as reviewer_id, list(track_id) as tracks from reviewers_with_tracks group by email\")" - ] - }, - { - "cell_type": "markdown", - "id": "adeb8dcc-614c-430e-a65f-932ee13a881c", - "metadata": {}, - "source": [ - "# Final tables for script" - ] - }, - { - "cell_type": "markdown", - "id": "cac7157c-c2eb-4f2e-bd02-cc2fe788afc4", - "metadata": {}, - "source": [ - "## reviewers_to_assign" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "183fabc0-a5fa-446d-9375-381cd75ff7c7", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"\"\"\n", - "create or replace table reviewers_to_assign as\n", - "select\n", - " reviewers_with_coi.email as reviewer_id,\n", - " reviewers_with_tracks.track_ids as tracks,\n", - " reviewers_with_coi.submission_ids as conflicts_submission_ids\n", - "from reviewers_with_coi\n", - "join reviewers_with_tracks on reviewers_with_tracks.email = reviewers_with_coi.email\n", - "\"\"\")\n", - "\n", - "con.sql(\"table reviewers_to_assign\").df()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c1facf8-046c-49e0-b21b-67c2d7b5dc64", - "metadata": {}, - "outputs": [], - "source": [ - "# con.sql(\"table reviewers_to_assign\").df().to_csv(\"input/reviewers_to_assign.csv\")" - ] - }, - { - "cell_type": "markdown", - "id": "5f717e8c-9b97-4666-a2c7-80ab491d61af", - "metadata": {}, - "source": [ - "## submissions_to_assign" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "34f07bb1-101e-45f1-8598-e6a459393de9", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"\"\"\n", - "create or replace table submissions_to_assign as\n", - "select\n", - " ID as submission_id,\n", - " string_split(\\\"Speaker IDs\\\", '\\n') as author_ids,\n", - " track_id as track\n", - "from pretalx_sessions\n", - " join tracks on pretalx_sessions.Track = tracks.name\n", - "\"\"\")\n", - "\n", - "con.sql(\"table submissions_to_assign\").df()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59c00158-ce1b-4f58-9cff-dbb34a5cd8af", - "metadata": {}, - "outputs": [], - "source": [ - "# con.sql(\"table submissions_to_assign\").df().to_csv(\"input/submissions_to_assign.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d2c113b9-e32f-40c9-ab97-df975d12f741", - "metadata": {}, - "outputs": [], - "source": [ - "# con.sql(\"table submissions_to_assign\").df().author_ids.iloc[1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5c7a0fd-b18e-4f57-b0ef-23fdd06531ff", - "metadata": {}, - "outputs": [], - "source": [ - "con.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6dd57d1-49cb-42e3-a774-81b2d2dca2a0", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/Run-assignments.ipynb b/notebooks/Run-assignments.ipynb deleted file mode 100644 index 59e2b88..0000000 --- a/notebooks/Run-assignments.ipynb +++ /dev/null @@ -1,620 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "bbfa2c54-c1f1-4241-bc27-91fc1b1265bf", - "metadata": {}, - "outputs": [], - "source": [ - "# %%\n", - "####################\n", - "## ASSIGN REVIEWS ##\n", - "####################\n", - "# Imports\n", - "import numpy as np\n", - "import pandas as pd\n", - "import duckdb\n", - "\n", - "import sys\n", - "sys.path.append(\"..\")\n", - "from assign_reviews import create_objective_fun, create_lb_ub, create_constraints, solve_milp, format_and_output_result" - ] - }, - { - "cell_type": "markdown", - "id": "ad38296b-0b09-4c67-bff3-37943db918b1", - "metadata": {}, - "source": [ - "# Start script" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b72db3b-80e4-41b5-992c-e01edc0c223e", - "metadata": {}, - "outputs": [], - "source": [ - "mkdir output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e0ec716-20a4-470b-9aea-5dc5853be804", - "metadata": {}, - "outputs": [], - "source": [ - "ASSIGN_TUTORIALS_TO_ANYONE = False\n", - "TUTORIAL_COEFF = 0.8\n", - "\n", - "DEBUG = True\n", - "\n", - "database_file = \"../data/assign_reviews.db\"\n", - "con = duckdb.connect(database_file)\n", - "df_submissions = con.sql(\"table submissions_to_assign\").df()\n", - "df_reviewers = con.sql(\"table reviewers_to_assign\").df()\n", - "\n", - "df_submissions = df_submissions.assign(assigned_reviewer_ids=[[]] * len(df_submissions))\n", - "df_reviewers = df_reviewers.assign(assigned_submission_ids=[[]] * len(df_reviewers))\n", - "\n", - "len(df_submissions), len(df_reviewers)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ef711304-a232-4b5f-a5f2-08ae410c8d62", - "metadata": {}, - "outputs": [], - "source": [ - "df_submissions[df_submissions.track==\"TUT\"]" - ] - }, - { - "cell_type": "markdown", - "id": "34384f90-6f5a-42b6-b6d6-b27c756ee9b8", - "metadata": {}, - "source": [ - "## Step 1. Assign tutorial reviewers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6e46cab-603a-4c28-8d75-bcbf982cdb56", - "metadata": {}, - "outputs": [], - "source": [ - "MIN_TUTORIALS_PER_PERSON = 0\n", - "MAX_TUTORIALS_PER_PERSON = 5\n", - "MIN_REVIEWERS_PER_TUTORIAL = 3\n", - "MAX_REVIEWERS_PER_TUTORIAL = 4\n", - "\n", - "df_submissions_tutorials = df_submissions[df_submissions.track==\"TUT\"]\n", - "\n", - "solution = solve_milp(\n", - " df_reviewers,\n", - " df_submissions_tutorials,\n", - " MIN_TUTORIALS_PER_PERSON,\n", - " MAX_TUTORIALS_PER_PERSON,\n", - " MIN_REVIEWERS_PER_TUTORIAL,\n", - " MAX_REVIEWERS_PER_TUTORIAL,\n", - " TUTORIAL_COEFF,\n", - " ASSIGN_TUTORIALS_TO_ANYONE\n", - ")\n", - "reviewers, submissions = format_and_output_result(df_reviewers, df_submissions_tutorials, solution, post_fix=\"00\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7109479e-3e56-4ef0-9f17-1be5e9d6f7f9", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(reviewers)\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "029c405c-48fb-4598-896e-e0227e9b6987", - "metadata": {}, - "outputs": [], - "source": [ - "df_reviewers_with_tut = df_reviewers.assign(assigned_submission_ids=df.assigned_submission_ids)\n", - "df_reviewers_with_tut" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ee582a7-e840-4273-ae03-aa88d1912157", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"select * from df_reviewers_with_tut\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9c3195a4-2baf-4dd0-a7b3-62ce193e00fc", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"create or replace table reviewer_assignments_00 as select * from df_reviewers_with_tut\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f2fcddf-8573-4ce5-9e33-ab3c2ff73cc5", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(submissions)\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03146222-8257-446e-820c-89710391d08d", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"\"\"\n", - "create or replace table submission_assignments_00 as\n", - "select df_submissions.submission_id, df_submissions.author_ids, df_submissions.track,\n", - "list_concat(df_submissions.assigned_reviewer_ids, df.assigned_reviewer_ids) as assigned_reviewer_ids\n", - "from df_submissions\n", - "left join df on df.submission_id = df_submissions.submission_id\n", - "\"\"\")\n", - "con.sql(\"table submission_assignments_00\")" - ] - }, - { - "cell_type": "markdown", - "id": "d303b378-63ff-405c-aaf1-4af51f09bfea", - "metadata": {}, - "source": [ - "## Step 2. Assign talk reviewers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e9695336-6696-4dc3-86eb-9d0eea82bcea", - "metadata": {}, - "outputs": [], - "source": [ - "df_reviewers_with_tut[df_reviewers_with_tut.assigned_submission_ids.apply(len) == 0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0174c6e7-0feb-4073-9ee7-0f994439dd2c", - "metadata": {}, - "outputs": [], - "source": [ - "MIN_REVIEWS_PER_PERSON = 5\n", - "MAX_REVIEWS_PER_PERSON = 9\n", - "MIN_REVIEWERS_PER_SUBMISSION = 2\n", - "MAX_REVIEWERS_PER_SUBMISSION = 4\n", - "\n", - "df_reviewers_no_submissions = df_reviewers_with_tut[df_reviewers_with_tut.assigned_submission_ids.apply(len) == 0]\n", - "df_submissions_no_tutorials = df_submissions[df_submissions.track!=\"TUT\"]\n", - "\n", - "solution = solve_milp(\n", - " df_reviewers_no_submissions,\n", - " df_submissions_no_tutorials,\n", - " MIN_REVIEWS_PER_PERSON,\n", - " MAX_REVIEWS_PER_PERSON,\n", - " MIN_REVIEWERS_PER_SUBMISSION,\n", - " MAX_REVIEWERS_PER_SUBMISSION,\n", - " TUTORIAL_COEFF,\n", - " ASSIGN_TUTORIALS_TO_ANYONE\n", - ")\n", - "if solution is not None:\n", - " reviewers, submissions = format_and_output_result(df_reviewers_no_submissions, df_submissions_no_tutorials, solution, post_fix=\"01\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "941e4ea6-e3d0-4da5-bb32-9c1bbb6698bf", - "metadata": {}, - "outputs": [], - "source": [ - "df_reviewers_with_tut" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e56630ab-54b5-4c81-a91b-0ee795f6ac35", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(reviewers)[[\"reviewer_id\", \"assigned_submission_ids\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa3bc6b2-00a1-4c63-b8c4-816bce8ecead", - "metadata": {}, - "outputs": [], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d4a8cf52-4eba-4231-a779-fcfcd4ae3387", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"\"\"\n", - "create or replace table reviewer_assignments_01 as\n", - "select\n", - " df_reviewers_with_tut.reviewer_id, tracks, conflicts_submission_ids,\n", - " list_concat(df_reviewers_with_tut.assigned_submission_ids, df.assigned_submission_ids) as assigned_submission_ids\n", - "from df_reviewers_with_tut\n", - "left join df on df.reviewer_id = df_reviewers_with_tut.reviewer_id\n", - "\"\"\")\n", - "con.sql(\"table reviewer_assignments_01\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "656b2edc-0374-4c66-b7aa-19fb315c73d1", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(submissions)\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a1716c3-d879-4d91-9aae-69601f372f77", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"\"\"\n", - "create or replace table submission_assignments_01 as\n", - "select submission_assignments_00.submission_id, submission_assignments_00.author_ids, submission_assignments_00.track,\n", - "list_concat(submission_assignments_00.assigned_reviewer_ids, df.assigned_reviewer_ids) as assigned_reviewer_ids\n", - "from submission_assignments_00\n", - "left join df on df.submission_id = submission_assignments_00.submission_id\n", - "\"\"\")\n", - "con.sql(\"table submission_assignments_01\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d2cc2801-c6c5-417a-9f5d-cf60ee8d3dc7", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(submissions)\n", - "df = df.assign(num_reviewers=df.assigned_reviewer_ids.apply(len))\n", - "df[df.num_reviewers>2]\n", - "df[df.num_reviewers==2]" - ] - }, - { - "cell_type": "markdown", - "id": "b7c3c63e-908b-474a-ab02-02873b89b942", - "metadata": {}, - "source": [ - "## Step 3. Assign talks to tutorial reviewers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1e3d00e-f3bd-4166-b00d-10c9c5c4c2ec", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(submissions)\n", - "df = df.assign(num_reviewers=df.assigned_reviewer_ids.apply(len))\n", - "df_submissions_few_reviewers = df[df.num_reviewers==2]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2afdf9a9-b4bd-4cbf-b8ec-9148236cc53f", - "metadata": {}, - "outputs": [], - "source": [ - "MIN_REVIEWS_PER_PERSON = 0\n", - "MAX_REVIEWS_PER_PERSON = 4\n", - "MIN_REVIEWERS_PER_SUBMISSION = 1\n", - "MAX_REVIEWERS_PER_SUBMISSION = 2\n", - "\n", - "df_reviewers_only_tut = df_reviewers_with_tut[df_reviewers_with_tut.assigned_submission_ids.apply(len) > 0]\n", - "\n", - "solution = solve_milp(\n", - " df_reviewers_only_tut,\n", - " df_submissions_few_reviewers,\n", - " MIN_REVIEWS_PER_PERSON,\n", - " MAX_REVIEWS_PER_PERSON,\n", - " MIN_REVIEWERS_PER_SUBMISSION,\n", - " MAX_REVIEWERS_PER_SUBMISSION,\n", - " TUTORIAL_COEFF,\n", - " ASSIGN_TUTORIALS_TO_ANYONE\n", - ")\n", - "\n", - "if solution is not None:\n", - " reviewers, submissions = format_and_output_result(df_reviewers_only_tut, df_submissions_few_reviewers, solution, post_fix=\"02\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ddc69f0-6408-4d4a-91eb-7e1d967aa6ab", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(submissions)\n", - "df = df.assign(num_reviewers=df.assigned_reviewer_ids.apply(len))\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6dcb32f-b32a-42ea-b36c-49674f39ecb2", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(reviewers)\n", - "df = df[[\"reviewer_id\", \"assigned_submission_ids\"]]\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "132eb314-268d-4778-bce1-dda40cbdd2a3", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"\"\"\n", - "create or replace table reviewer_assignments_02 as\n", - "select\n", - " reviewer_assignments_01.reviewer_id, tracks, conflicts_submission_ids,\n", - " list_concat(reviewer_assignments_01.assigned_submission_ids, df.assigned_submission_ids) as assigned_submission_ids\n", - "from reviewer_assignments_01\n", - "left join df on df.reviewer_id = reviewer_assignments_01.reviewer_id\n", - "\"\"\")\n", - "con.sql(\"table reviewer_assignments_02\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5397c3f-d63f-493e-908a-47e857e09f57", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"select count(*), string_agg(reviewer_id), len(assigned_submission_ids) as num_submissions from reviewer_assignments_02 group by num_submissions\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e57c7c6-d2f4-4742-83cb-f876f01574ce", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(submissions)\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12ecbfa5-60bc-4c13-b209-cb686aa04c7f", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"\"\"\n", - "create or replace table submission_assignments_02 as\n", - "select submission_assignments_01.submission_id, submission_assignments_01.author_ids, submission_assignments_01.track,\n", - "list_concat(submission_assignments_01.assigned_reviewer_ids, df.assigned_reviewer_ids) as assigned_reviewer_ids\n", - "from submission_assignments_01\n", - "left join df on df.submission_id = submission_assignments_01.submission_id\n", - "\"\"\")\n", - "con.sql(\"table submission_assignments_02\")" - ] - }, - { - "cell_type": "markdown", - "id": "f8186a06-981a-4f88-bde7-be4479c57697", - "metadata": {}, - "source": [ - "## Final counts/checks" - ] - }, - { - "cell_type": "markdown", - "id": "135ef2a0-69ab-4b76-afc4-41e898ee9083", - "metadata": {}, - "source": [ - "All submissions have at least 3 reviewers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d8719243-645d-40d1-97ee-36e21f6cfdc9", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"\"\"\n", - "select string_agg(submission_id), count(track), len(assigned_reviewer_ids) from submission_assignments_02 group by len(assigned_reviewer_ids)\n", - "\"\"\")" - ] - }, - { - "cell_type": "markdown", - "id": "de1edcf3-2866-422b-ade4-b9c4caa5bbfd", - "metadata": {}, - "source": [ - "Step 1: Only tutorial assignments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e3747df-88a0-4ebd-9f3c-4f0d73c4d9ce", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"\"\"\n", - "select string_agg(reviewer_id), count(reviewer_id), string_agg(tracks), len(assigned_submission_ids) from reviewer_assignments_00 group by len(assigned_submission_ids)\n", - "\"\"\")" - ] - }, - { - "cell_type": "markdown", - "id": "4ff68efd-6ef6-44c0-ae5e-16d8e22e230c", - "metadata": {}, - "source": [ - "Step 2: Add talks assignments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "40ec3a7e-e444-4890-b03b-54abdc90a13d", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"\"\"\n", - "select string_agg(reviewer_id), count(reviewer_id), string_agg(tracks), len(assigned_submission_ids) from reviewer_assignments_01 group by len(assigned_submission_ids)\n", - "\"\"\")" - ] - }, - { - "cell_type": "markdown", - "id": "7be6ab3a-21d5-46f5-bcb1-5d1ed79573f7", - "metadata": {}, - "source": [ - "Step 3: Assign talks to tutorial reviewers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "acea73f1-c7b1-4d32-8cca-2ab37ef6ab67", - "metadata": {}, - "outputs": [], - "source": [ - "con.sql(\"\"\"\n", - "select string_agg(reviewer_id), count(reviewer_id), string_agg(tracks), len(assigned_submission_ids) from reviewer_assignments_02 group by len(assigned_submission_ids)\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa5491c1-7086-4b54-ac34-8dd541f94767", - "metadata": {}, - "outputs": [], - "source": [ - "con.close()" - ] - }, - { - "cell_type": "markdown", - "id": "f993a8fb-25e1-4250-bbd6-8e67cf662659", - "metadata": {}, - "source": [ - "## Final export" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e67db9e7-5ca8-40e9-b90b-1a4dd97fb05d", - "metadata": {}, - "outputs": [], - "source": [ - "import duckdb\n", - "database_file = \"../data/assign_reviews.db\"\n", - "con = duckdb.connect(database_file)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "585bbb58-3582-41b9-b805-b9a607508f5e", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "reviewer_assignments_final = {\n", - " item[\"reviewer_id\"]: item[\"assigned_submission_ids\"]\n", - " for item in\n", - " con.sql(\"table reviewer_assignments_02\").df()[[\"reviewer_id\", \"assigned_submission_ids\"]].to_dict(\"records\")\n", - "}\n", - "with open(f\"output/reviewer-assignments.json\", \"w\") as fp:\n", - " fp.write(json.dumps(reviewer_assignments_final, indent=4))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c502cf9b-f573-4565-bc59-b98e0c0070b1", - "metadata": {}, - "outputs": [], - "source": [ - "con.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f88e36e-9b7e-4c4e-a43d-b7248a2f0f1e", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/pre-processing.py b/notebooks/pre-processing.py new file mode 100644 index 0000000..9f6c49f --- /dev/null +++ b/notebooks/pre-processing.py @@ -0,0 +1,415 @@ +# --- +# jupyter: +# jupytext: +# notebook_metadata_filter: all,-jupytext.text_representation.jupytext_version +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# language_info: +# codemirror_mode: +# name: ipython +# version: 3 +# file_extension: .py +# mimetype: text/x-python +# name: python +# nbconvert_exporter: python +# pygments_lexer: ipython3 +# version: 3.12.1 +# --- + +# %% +import duckdb +from IPython import display + +# Raw data to import +raw_files = dict( + scipy_reviewers="../data/scipy_reviewers.csv", # people who signed up as reviewers + pretalx_sessions="../data/sessions.csv", # all proposal exported from pretalx + pretalx_speakers="../data/speakers.csv", # all speakers exported from pretalx + pretalx_reviewers="../data/pretalx_reviewers.csv", # all reviewers copy-pasted from pretalx + coi_reviewers="../data/scipy_coi_export.csv", # all responses to the coi form + coi_authors="../data/coi_authors.csv", # copy pasted values of author names from coi form + tracks="../data/tracks.csv", # manually entered track IDs +) + +# Output +database_file = "../data/assign_reviews.db" + +# %% +con = duckdb.connect(database_file) + + +# %% +def create_and_show_table(file_name, table_name, show=True): + con.sql(f'create or replace table {table_name} as select * from read_csv("{file_name}", header=true)') + if show is True: + return con.sql(f"table {table_name}") + + +# %% +for table_name, file_name in raw_files.items(): + print(table_name) + display.display(create_and_show_table(file_name, table_name).df()) + print("\n") + +# %% +con.sql( + """ +table tracks +""" +) + +# %% +con.sql( + """ +with dupes as + ( + select + name, + num, + email + from + ( + select + name, + count(*) as num, + string_agg(Email) as email + from + scipy_reviewers + group by Name + ) + where + num>1 + ) + +select * from dupes +""" +).df() + +# %% +con.sql( + """ +select count(*) from scipy_reviewers +""" +) + +# %% +con.sql( + """ +select count(*) from pretalx_reviewers +""" +) + +# %% +con.sql( + """ +select count(*) from coi_reviewers +""" +) + +# %% [markdown] +# This is a table with all reviewers who +# 1. signed up +# 2. created an account on pretalx +# 3. submitted the COI form + +# %% +con.sql( + """ +create or replace table reviewers as + select + scipy_reviewers.Name as name, + scipy_reviewers.Email as email, + \"Track(s) to review for (check all that apply)\" as tracks, + \"Mark the speaker(s) or company/organization/affiliation(s) that could pose a conflict of interest\" as coi + from scipy_reviewers + join pretalx_reviewers on scipy_reviewers.Email = pretalx_reviewers.Email + join coi_reviewers on coi_reviewers.Email = pretalx_reviewers.Email +""" +) + +df = con.sql("select distinct * from reviewers").df() +num_reviewers = len(df) +df + +# %% [markdown] +# Reviewers who signed up for pretalx but did not fill in COI + +# %% +con = duckdb.connect(database_file) + +# %% +df = con.sql( + "select * from pretalx_reviewers anti join coi_reviewers on pretalx_reviewers.Email = coi_reviewers.Email" +).df() +num_pretalx_no_coi = len(df) +df + +# %% +# df.to_csv("input/signed_up_for_pretalx_no_coi.csv") + +# %% [markdown] +# Reviewers who filled in COI but did not sign up for pretalx + +# %% +df = con.sql( + "select * from coi_reviewers anti join pretalx_reviewers on coi_reviewers.Email = pretalx_reviewers.Email" +).df() +num_coi_no_pretalx = len(df) +df + +# %% +# df.to_csv("input/submitted_coi_no_pretalx.csv") + +# %% [markdown] +# People who signed up as reviewer + +# %% +df = con.sql( + """ +select distinct * from scipy_reviewers +""" +).df() +num_signed_up = len(df) +df + +# %% [markdown] +# People who signed up as reviewer and signed up for pretalx and submitted COI but used different email addresses + +# %% +df = con.sql( + """ +create or replace table reviewers_with_email_typos as +(with no_coi as +(select * from pretalx_reviewers anti join coi_reviewers on pretalx_reviewers.Email = coi_reviewers.Email), +no_pretalx as +(select * from coi_reviewers anti join pretalx_reviewers on coi_reviewers.Email = pretalx_reviewers.Email) +select distinct scipy_reviewers.Name, scipy_reviewers.Email, no_pretalx.Email as no_pretalx_email, no_coi.email as no_coi_email from scipy_reviewers +join no_coi on no_coi.Name = scipy_reviewers.Name +join no_pretalx on no_pretalx.Name = no_coi.Name) +""" # noqa: E501 +) +df = con.sql("table reviewers_with_email_typos").df() +num_typos = len(df) +df + +# %% [markdown] +# People who signed up as reviewer and signed up for pretalx and submitted COI but used different names + +# %% +df = con.sql( + """ +(with no_coi as +(select * from pretalx_reviewers anti join coi_reviewers on pretalx_reviewers.Email = coi_reviewers.Email), +no_pretalx as +(select * from coi_reviewers anti join pretalx_reviewers on coi_reviewers.Email = pretalx_reviewers.Email) +select distinct scipy_reviewers.Name, scipy_reviewers.Email, no_pretalx.Name as no_pretalx_name, no_coi.name as no_coi_name from scipy_reviewers +join no_coi on no_coi.Email = scipy_reviewers.Email +join no_pretalx on no_pretalx.Email = no_coi.Email) +""" # noqa: E501 +).df() +num_typos_name = len(df) +df + +# %% +# df.to_csv("input/reviewers_multi_email.csv") + +# %% [markdown] +# People who signed up as reviewer and didn't sign up for pretalx nor submitted COI + +# %% +df = con.sql( + """ +(with no_coi as +(select * from pretalx_reviewers anti join coi_reviewers on pretalx_reviewers.Email = coi_reviewers.Email), +no_pretalx as +(select * from coi_reviewers anti join pretalx_reviewers on coi_reviewers.Email = pretalx_reviewers.Email) +select distinct scipy_reviewers.Name, scipy_reviewers.Email from scipy_reviewers +anti join reviewers on reviewers.Name = scipy_reviewers.Name +anti join no_coi on no_coi.Name = scipy_reviewers.Name +anti join no_pretalx on no_pretalx.Name = scipy_reviewers.Name) +""" +).df() +df + +# %% +df = con.sql( + """ +select distinct * from scipy_reviewers +anti join reviewers on scipy_reviewers.Email = reviewers.email +""" +).df() +num_no_show = len(df) +df + +# %% +# df.to_csv("input/all_reviewers_without_assignments.csv") + +# %% +num_no_show = num_signed_up - num_reviewers - num_pretalx_no_coi - num_coi_no_pretalx +num_partial = sum([num_pretalx_no_coi, num_coi_no_pretalx, num_no_show]) +num_reviewers, num_signed_up, num_pretalx_no_coi, num_coi_no_pretalx, num_no_show, num_partial + +# %% +con.sql("select * from reviewers where instr(name, 'eli')") + +# %% +# con.sql("table reviewers").df().to_csv("input/reviewers_to_assign_with_name.csv") + +# %% +con.sql("select * from reviewers where instr(Name, 'Wu')") + +# %% +sum([num_pretalx_no_coi, num_coi_no_pretalx, num_reviewers]) + +# %% +con.sql( + """ +with dupes as + ( + select + * + from + ( + select + name, + count(*) as num, + string_agg(email) as email, + string_agg(tracks) as tracks, + string_agg(coi) as coi + from + reviewers + group by name + ) + where + num>1 + ) + +select * from dupes +""" +).df().T.to_json() + +# %% +con.sql("create or replace table reviewers as (select distinct * from reviewers)") + +# %% +con.sql( + """ +create or replace table reviewers_with_tracks as +with reviewers_no_dupes as (select distinct * from reviewers) +select reviewers_no_dupes.name, email, list(tracks.name) as tracks, list(tracks.track_id) as track_ids from reviewers_no_dupes + join tracks on instr(reviewers_no_dupes.tracks, tracks.name) + group by reviewers_no_dupes.name, email +""" # noqa: E501 +) + +con.sql("select distinct * from reviewers_with_tracks") + +# %% +con.sql('select ID as submission_id, "Speaker IDs" as speaker_ids from pretalx_sessions') + +# %% +con.sql( + """ +create or replace table reviewers_with_coi as + +with submissions_with_authors as ( + select + ID as submission_id, + \"Speaker IDs\" as speaker_ids + from + pretalx_sessions +) +select + reviewers.name, + reviewers.email, + list(pretalx_speakers.Name) as speakers, + list(pretalx_speakers.ID) AS speaker_ids, + list(submissions_with_authors.submission_id) as submission_ids +from + reviewers + left join coi_authors on instr(coi, coi_authors.author) + left join pretalx_speakers on contains(coi_authors.author, pretalx_speakers.Name) + left join submissions_with_authors on contains(submissions_with_authors.speaker_ids, pretalx_speakers.ID) +group by reviewers.name, reviewers.email +order by reviewers.name +""" +) + +con.sql("table reviewers_with_coi") + +# %% +con.sql( + """ +with reviewers_with_coi_pre as ( + select name, email, author + from reviewers + join coi_authors on instr(coi, coi_authors.author) +) +select count(*), author from reviewers_with_coi_pre anti join pretalx_speakers on contains(reviewers_with_coi_pre.author, pretalx_speakers.Name) group by author +""" # noqa: E501 +) + +# %% +con.sql("table reviewers_with_tracks").df() + +# %% +con.sql("select email as reviewer_id, list(track_id) as tracks from reviewers_with_tracks group by email") + +# %% [markdown] +# # Final tables for script + +# %% [markdown] +# ## reviewers_to_assign + +# %% +con.sql( + """ +create or replace table reviewers_to_assign as +select + reviewers_with_coi.email as reviewer_id, + reviewers_with_tracks.track_ids as tracks, + reviewers_with_coi.submission_ids as conflicts_submission_ids +from reviewers_with_coi +join reviewers_with_tracks on reviewers_with_tracks.email = reviewers_with_coi.email +""" +) + +con.sql("table reviewers_to_assign").df() + +# %% +# con.sql("table reviewers_to_assign").df().to_csv("input/reviewers_to_assign.csv") + +# %% [markdown] +# ## submissions_to_assign + +# %% +con.sql( + """ +create or replace table submissions_to_assign as +select + ID as submission_id, + string_split(\"Speaker IDs\", '\n') as author_ids, + track_id as track +from pretalx_sessions + join tracks on pretalx_sessions.Track = tracks.name +""" +) + +con.sql("table submissions_to_assign").df() + +# %% +# con.sql("table submissions_to_assign").df().to_csv("input/submissions_to_assign.csv") + +# %% +# con.sql("table submissions_to_assign").df().author_ids.iloc[1] + +# %% +con.close() + +# %% diff --git a/notebooks/run-assignments.py b/notebooks/run-assignments.py new file mode 100644 index 0000000..a4fc383 --- /dev/null +++ b/notebooks/run-assignments.py @@ -0,0 +1,331 @@ +# --- +# jupyter: +# jupytext: +# notebook_metadata_filter: all,-jupytext.text_representation.jupytext_version +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# language_info: +# codemirror_mode: +# name: ipython +# version: 3 +# file_extension: .py +# mimetype: text/x-python +# name: python +# nbconvert_exporter: python +# pygments_lexer: ipython3 +# version: 3.12.1 +# --- + +# %% +#################### +## ASSIGN REVIEWS ## +#################### +# Imports +import json +import sys + +import duckdb +import pandas as pd + +sys.path.append("..") +from assign_reviews import format_and_output_result, solve_milp + +# %% [markdown] +# # Start script + +# %% +# mkdir output + +# %% +ASSIGN_TUTORIALS_TO_ANYONE = False +TUTORIAL_COEFF = 0.8 + +DEBUG = True + +database_file = "../data/assign_reviews.db" +con = duckdb.connect(database_file) +df_submissions = con.sql("table submissions_to_assign").df() +df_reviewers = con.sql("table reviewers_to_assign").df() + +df_submissions = df_submissions.assign(assigned_reviewer_ids=[[]] * len(df_submissions)) +df_reviewers = df_reviewers.assign(assigned_submission_ids=[[]] * len(df_reviewers)) + +len(df_submissions), len(df_reviewers) + +# %% +df_submissions[df_submissions.track == "TUT"] + +# %% [markdown] +# ## Step 1. Assign tutorial reviewers + +# %% +MIN_TUTORIALS_PER_PERSON = 0 +MAX_TUTORIALS_PER_PERSON = 5 +MIN_REVIEWERS_PER_TUTORIAL = 3 +MAX_REVIEWERS_PER_TUTORIAL = 4 + +df_submissions_tutorials = df_submissions[df_submissions.track == "TUT"] + +solution = solve_milp( + df_reviewers, + df_submissions_tutorials, + MIN_TUTORIALS_PER_PERSON, + MAX_TUTORIALS_PER_PERSON, + MIN_REVIEWERS_PER_TUTORIAL, + MAX_REVIEWERS_PER_TUTORIAL, + TUTORIAL_COEFF, + ASSIGN_TUTORIALS_TO_ANYONE, +) +reviewers, submissions = format_and_output_result(df_reviewers, df_submissions_tutorials, solution, post_fix="00") + +# %% +df = pd.DataFrame(reviewers) +df + +# %% +df_reviewers_with_tut = df_reviewers.assign(assigned_submission_ids=df.assigned_submission_ids) +df_reviewers_with_tut + +# %% +con.sql("select * from df_reviewers_with_tut") + +# %% +con.sql("create or replace table reviewer_assignments_00 as select * from df_reviewers_with_tut") + +# %% +df = pd.DataFrame(submissions) +df + +# %% +con.sql( + """ +create or replace table submission_assignments_00 as +select df_submissions.submission_id, df_submissions.author_ids, df_submissions.track, +list_concat(df_submissions.assigned_reviewer_ids, df.assigned_reviewer_ids) as assigned_reviewer_ids +from df_submissions +left join df on df.submission_id = df_submissions.submission_id +""" +) +con.sql("table submission_assignments_00") + +# %% [markdown] +# ## Step 2. Assign talk reviewers + +# %% +df_reviewers_with_tut[df_reviewers_with_tut.assigned_submission_ids.apply(len) == 0] + +# %% +MIN_REVIEWS_PER_PERSON = 5 +MAX_REVIEWS_PER_PERSON = 9 +MIN_REVIEWERS_PER_SUBMISSION = 2 +MAX_REVIEWERS_PER_SUBMISSION = 4 + +df_reviewers_no_submissions = df_reviewers_with_tut[df_reviewers_with_tut.assigned_submission_ids.apply(len) == 0] +df_submissions_no_tutorials = df_submissions[df_submissions.track != "TUT"] + +solution = solve_milp( + df_reviewers_no_submissions, + df_submissions_no_tutorials, + MIN_REVIEWS_PER_PERSON, + MAX_REVIEWS_PER_PERSON, + MIN_REVIEWERS_PER_SUBMISSION, + MAX_REVIEWERS_PER_SUBMISSION, + TUTORIAL_COEFF, + ASSIGN_TUTORIALS_TO_ANYONE, +) +if solution is not None: + reviewers, submissions = format_and_output_result( + df_reviewers_no_submissions, df_submissions_no_tutorials, solution, post_fix="01" + ) + +# %% +df_reviewers_with_tut + +# %% +df = pd.DataFrame(reviewers)[["reviewer_id", "assigned_submission_ids"]] + +# %% +df + +# %% +con.sql( + """ +create or replace table reviewer_assignments_01 as +select + df_reviewers_with_tut.reviewer_id, tracks, conflicts_submission_ids, + list_concat(df_reviewers_with_tut.assigned_submission_ids, df.assigned_submission_ids) as assigned_submission_ids +from df_reviewers_with_tut +left join df on df.reviewer_id = df_reviewers_with_tut.reviewer_id +""" +) +con.sql("table reviewer_assignments_01") + +# %% +df = pd.DataFrame(submissions) +df + +# %% +con.sql( + """ +create or replace table submission_assignments_01 as +select submission_assignments_00.submission_id, submission_assignments_00.author_ids, submission_assignments_00.track, +list_concat(submission_assignments_00.assigned_reviewer_ids, df.assigned_reviewer_ids) as assigned_reviewer_ids +from submission_assignments_00 +left join df on df.submission_id = submission_assignments_00.submission_id +""" +) +con.sql("table submission_assignments_01") + +# %% +df = pd.DataFrame(submissions) +df = df.assign(num_reviewers=df.assigned_reviewer_ids.apply(len)) +df[df.num_reviewers > 2] +df[df.num_reviewers == 2] + +# %% [markdown] +# ## Step 3. Assign talks to tutorial reviewers + +# %% +df = pd.DataFrame(submissions) +df = df.assign(num_reviewers=df.assigned_reviewer_ids.apply(len)) +df_submissions_few_reviewers = df[df.num_reviewers == 2] + +# %% +MIN_REVIEWS_PER_PERSON = 0 +MAX_REVIEWS_PER_PERSON = 4 +MIN_REVIEWERS_PER_SUBMISSION = 1 +MAX_REVIEWERS_PER_SUBMISSION = 2 + +df_reviewers_only_tut = df_reviewers_with_tut[df_reviewers_with_tut.assigned_submission_ids.apply(len) > 0] + +solution = solve_milp( + df_reviewers_only_tut, + df_submissions_few_reviewers, + MIN_REVIEWS_PER_PERSON, + MAX_REVIEWS_PER_PERSON, + MIN_REVIEWERS_PER_SUBMISSION, + MAX_REVIEWERS_PER_SUBMISSION, + TUTORIAL_COEFF, + ASSIGN_TUTORIALS_TO_ANYONE, +) + +if solution is not None: + reviewers, submissions = format_and_output_result( + df_reviewers_only_tut, df_submissions_few_reviewers, solution, post_fix="02" + ) + +# %% +df = pd.DataFrame(submissions) +df = df.assign(num_reviewers=df.assigned_reviewer_ids.apply(len)) +df + +# %% +df = pd.DataFrame(reviewers) +df = df[["reviewer_id", "assigned_submission_ids"]] +df + +# %% +con.sql( + """ +create or replace table reviewer_assignments_02 as +select + reviewer_assignments_01.reviewer_id, tracks, conflicts_submission_ids, + list_concat(reviewer_assignments_01.assigned_submission_ids, df.assigned_submission_ids) as assigned_submission_ids +from reviewer_assignments_01 +left join df on df.reviewer_id = reviewer_assignments_01.reviewer_id +""" +) +con.sql("table reviewer_assignments_02") + +# %% +con.sql( + "select count(*), string_agg(reviewer_id), len(assigned_submission_ids) as num_submissions from reviewer_assignments_02 group by num_submissions" # noqa: E501 +) + +# %% +df = pd.DataFrame(submissions) +df + +# %% +con.sql( + """ +create or replace table submission_assignments_02 as +select submission_assignments_01.submission_id, submission_assignments_01.author_ids, submission_assignments_01.track, +list_concat(submission_assignments_01.assigned_reviewer_ids, df.assigned_reviewer_ids) as assigned_reviewer_ids +from submission_assignments_01 +left join df on df.submission_id = submission_assignments_01.submission_id +""" +) +con.sql("table submission_assignments_02") + +# %% [markdown] +# ## Final counts/checks + +# %% [markdown] +# All submissions have at least 3 reviewers + +# %% +con.sql( + """ +select string_agg(submission_id), count(track), len(assigned_reviewer_ids) from submission_assignments_02 group by len(assigned_reviewer_ids) +""" # noqa: E501 +) + +# %% [markdown] +# Step 1: Only tutorial assignments + +# %% +con.sql( + """ +select string_agg(reviewer_id), count(reviewer_id), string_agg(tracks), len(assigned_submission_ids) from reviewer_assignments_00 group by len(assigned_submission_ids) +""" # noqa: E501 +) + +# %% [markdown] +# Step 2: Add talks assignments + +# %% +con.sql( + """ +select string_agg(reviewer_id), count(reviewer_id), string_agg(tracks), len(assigned_submission_ids) from reviewer_assignments_01 group by len(assigned_submission_ids) +""" # noqa: E501 +) + +# %% [markdown] +# Step 3: Assign talks to tutorial reviewers + +# %% +con.sql( + """ +select string_agg(reviewer_id), count(reviewer_id), string_agg(tracks), len(assigned_submission_ids) from reviewer_assignments_02 group by len(assigned_submission_ids) +""" # noqa: E501 +) + +# %% +con.close() + +# %% [markdown] +# ## Final export + +# %% +database_file = "../data/assign_reviews.db" +con = duckdb.connect(database_file) + +# %% +reviewer_assignments_final = { + item["reviewer_id"]: item["assigned_submission_ids"] + for item in con.sql("table reviewer_assignments_02") + .df()[["reviewer_id", "assigned_submission_ids"]] + .to_dict("records") +} +with open("output/reviewer-assignments.json", "w") as fp: + fp.write(json.dumps(reviewer_assignments_final, indent=4)) + +# %% +con.close() diff --git a/pyproject.toml b/pyproject.toml index e16a72a..5743198 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,3 +17,8 @@ pythonpath = ['.'] [tool.coverage.report] show_missing = true + +[tool.jupytext] +# Always pair ipynb notebooks to py:percent files +formats = ["ipynb", "py:percent"] +notebook_metadata_filter = "all,-jupytext.text_representation.jupytext_version" diff --git a/requirements.lock b/requirements.lock index dd8c12b..aeb45fe 100644 --- a/requirements.lock +++ b/requirements.lock @@ -492,6 +492,15 @@ jupyterlab-server==2.25.4 \ # via # jupyterlab # notebook +jupytext==1.16.1 \ + --hash=sha256:68c7b68685e870e80e60fda8286fbd6269e9c74dc1df4316df6fe46eabc94c99 \ + --hash=sha256:796ec4f68ada663569e5d38d4ef03738a01284bfe21c943c485bc36433898bd0 +markdown-it-py==3.0.0 \ + --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ + --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb + # via + # jupytext + # mdit-py-plugins markupsafe==2.1.5 \ --hash=sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf \ --hash=sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff \ @@ -562,6 +571,14 @@ matplotlib-inline==0.1.6 \ # via # ipykernel # ipython +mdit-py-plugins==0.4.0 \ + --hash=sha256:b51b3bb70691f57f974e257e367107857a93b36f322a9e6d44ca5bf28ec2def9 \ + --hash=sha256:d8ab27e9aed6c38aa716819fedfde15ca275715955f8a185a8e1cf90fb1d2c1b + # via jupytext +mdurl==0.1.2 \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via markdown-it-py mistune==3.0.2 \ --hash=sha256:71481854c30fdbc938963d3605b72501f5c10a9320ecd412c121c163a1c7d205 \ --hash=sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8 @@ -579,6 +596,7 @@ nbformat==5.10.3 \ --hash=sha256:d9476ca28676799af85385f409b49d95e199951477a159a576ef2a675151e5e8 # via # jupyter-server + # jupytext # nbclient # nbconvert nest-asyncio==1.6.0 \ @@ -646,6 +664,7 @@ packaging==24.0 \ # jupyter-server # jupyterlab # jupyterlab-server + # jupytext # nbconvert pandas==2.2.1 \ --hash=sha256:04f6ec3baec203c13e3f8b139fb0f9f86cd8c0b94603ae3ae8ce9a422e9f5bee \ @@ -806,7 +825,9 @@ pyyaml==6.0.1 \ --hash=sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585 \ --hash=sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d \ --hash=sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f - # via jupyter-events + # via + # jupyter-events + # jupytext pyzmq==25.1.2 \ --hash=sha256:004ff469d21e86f0ef0369717351073e0e577428e514c47c8480770d5e24a565 \ --hash=sha256:00a06faa7165634f0cac1abb27e54d7a0b3b44eb9994530b8ec73cf52e15353b \ @@ -1144,6 +1165,10 @@ tinycss2==1.2.1 \ --hash=sha256:2b80a96d41e7c3914b8cda8bc7f705a4d9c49275616e886103dd839dfc847847 \ --hash=sha256:8cff3a8f066c2ec677c06dbc7b45619804a6938478d9d73c284b29d14ecb0627 # via nbconvert +toml==0.10.2 \ + --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ + --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f + # via jupytext tornado==6.4 \ --hash=sha256:02ccefc7d8211e5a7f9e8bc3f9e5b0ad6262ba2fbb683a6443ecc804e5224ce0 \ --hash=sha256:10aeaa8006333433da48dec9fe417877f8bcc21f48dda8d661ae79da357b2a63 \ diff --git a/requirements.txt b/requirements.txt index f8cac43..a60c50d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ notebook jupyterlab duckdb duckdb_engine +jupytext