From 8740887f8a945df44f7f2b66fe8a870e786c3822 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Mon, 6 May 2024 16:24:21 -0400 Subject: [PATCH 1/2] Update lenskit predictions notebook and add to CI --- .github/workflows/unit_tests.yml | 2 +- requirements.txt | 3 +- .../LenskitRecommendations.ipynb | 2859 ++++++++--------- tests/test_utils/test_data.py | 10 +- utils/missing_data_processing.py | 22 +- 5 files changed, 1428 insertions(+), 1468 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 6f43c87..477b0b8 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -42,5 +42,5 @@ jobs: run: treon missing_data/BorrowingCapacity.ipynb missing_data/MissingBorrowingActivity.ipynb missing_data/MissingBooks.ipynb missing_data/MissingMembers.ipynb missing_data/MissingMembershipActivities.ipynb missing_data/BookcatalogBooksEstimate.ipynb - name: Run treon to check selected speculative_reading notebooks - run: treon speculative_reading/HemingwayBorrowing.ipynb speculative_reading/PartialBorrowers.ipynb + run: treon speculative_reading/HemingwayBorrowing.ipynb speculative_reading/PartialBorrowers.ipynb speculative_reading/CombineRecommendations.ipynb speculative_reading/LenskitRecommendations.ipynb diff --git a/requirements.txt b/requirements.txt index 7feb142..a3c63dd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ seaborn great-tables # we used an unreleased version of copia with label options for plots git+https://github.com/mikekestemont/copia@3e57da4 -matplotlib==3.7 \ No newline at end of file +matplotlib==3.7 +scikit-learn \ No newline at end of file diff --git a/speculative_reading/LenskitRecommendations.ipynb b/speculative_reading/LenskitRecommendations.ipynb index 508b314..e7ce264 100644 --- a/speculative_reading/LenskitRecommendations.ipynb +++ b/speculative_reading/LenskitRecommendations.ipynb @@ -1,1495 +1,1452 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "I1x1AE35cOtj" - }, - "source": [ - "# Generate and Evaluate Lenskit Model Stability and Select Scores for Predictions" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EjZT_ntacTdr" - }, - "source": [ - "## Load libraries and data" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "kAuugX9YGLpu" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Lenskit is already installed.\n" - ] - } - ], - "source": [ - "# This code checks if the lenskit library is installed and installs it if it is not.\n", - "try:\n", - " import lenskit\n", - " print(\"Lenskit is already installed.\")\n", - "except ImportError:\n", - " print(\"Lenskit is not installed. 
Installing...\")\n", - " import subprocess\n", - " subprocess.check_call([\"pip\", \"install\", \"lenskit\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": { - "id": "Xo656ZZAGfei" - }, - "outputs": [], - "source": [ - "# Standard library imports\n", - "import os\n", - "import sys\n", - "from typing import List\n", - "\n", - "# Third party imports\n", - "import altair as alt # For data visualization\n", - "import numpy as np # For numerical operations\n", - "import pandas as pd # For data manipulation\n", - "from scipy.stats import zscore # For statistical computations\n", - "from sklearn.preprocessing import MinMaxScaler # For data preprocessing\n", - "from tqdm.notebook import tqdm # For progress bars\n", - "\n", - "# LensKit imports\n", - "from lenskit import Recommender, topn, util, batch, crossfold as xf # For recommendation systems\n", - "from lenskit.algorithms import als, basic # For recommendation algorithms\n", - "\n", - "# Local application/library specific imports\n", - "sys.path.append(\"..\")\n", - "from utils.missing_data_processing import * # For handling missing data" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "b5W2qUTyGSkv", - "outputId": "6d8bf784-b7a5-4df2-a71a-db8da792081b" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
useritemrating
2rhysconrad-typhoon1
8lanux-eyre-dewoolf-night-day1
12teryjames-joyce1
13teryfreeman-portrait-george-moore1
22macleish-adastern-tents-israel1
38alvearyeats-later-poems1
46joyce-jamesmantzius-history-theatrical-art1
51joyce-jamesscott-poems-walter-scott1
52joyce-jameschekhov-horse-stealers-stories1
53joyce-jamesstephens-crock-gold1
\n", - "
" - ], - "text/plain": [ - " user item rating\n", - "2 rhys conrad-typhoon 1\n", - "8 lanux-eyre-de woolf-night-day 1\n", - "12 tery james-joyce 1\n", - "13 tery freeman-portrait-george-moore 1\n", - "22 macleish-ada stern-tents-israel 1\n", - "38 alvear yeats-later-poems 1\n", - "46 joyce-james mantzius-history-theatrical-art 1\n", - "51 joyce-james scott-poems-walter-scott 1\n", - "52 joyce-james chekhov-horse-stealers-stories 1\n", - "53 joyce-james stephens-crock-gold 1" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Load the initial data into four DataFrames: events_df, members_df, books_df, and borrow_overrides_df.\n", - "events_df, members_df, books_df, borrow_overrides_df = load_initial_data()\n", - "\n", - "# Process the events data to clean it and prepare it for analysis.\n", - "events_df = preprocess_events_data(events_df)\n", - "\n", - "# Extract the item ID from the URI in the books DataFrame.\n", - "# The item ID is the second to last part of the URI.\n", - "books_df[\"item_id\"] = books_df.uri.apply(\n", - " lambda x: x.split(\"/\")[-2] if pd.notna(x) else None\n", - ")\n", - "\n", - "# Generate short IDs for the members in the members DataFrame.\n", - "# The ID is the second to last part of the URI.\n", - "members_df[\"id\"] = members_df.uri.apply(\n", - " lambda x: x.split(\"/\")[-2]\n", - ")\n", - "\n", - "# Get all member-book interactions from the events DataFrame.\n", - "# Only include rows where the item URI is not null.\n", - "interactions_df = events_df[events_df.item_uri.notna()].copy()\n", - "\n", - "# Restrict the interactions to borrow events only.\n", - "interactions_df = interactions_df[interactions_df.event_type == 'Borrow'].copy()\n", - "\n", - "# Reduce the interactions DataFrame to the minimum user/item interaction fields and drop duplicate rows.\n", - "unique_interactions_df = interactions_df[\n", - " [\"member_id\", \"item_id\"]\n", - "].drop_duplicates(subset=[\"member_id\", \"item_id\"])\n", - "\n", - "# Rename the columns to the names expected by LensKit.\n", - "# The DataFrame is renamed to 'ratings' for use with the tutorial.\n", - "ratings = unique_interactions_df.rename(columns={'member_id': 'user', 'item_id': 'item'})\n", - "\n", - "# The example assumes a rating. Use a 1/0 rating and set all to 1 to confirm interaction.\n", - "ratings['rating'] = 1\n", - "\n", - "# Display the first 10 rows of the ratings DataFrame.\n", - "ratings.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
member_idsubscription_startsubscription_endsubscription_eventssubscription_volumessubscription_daysinternal_gapsknown_borrows
23raphael-france1920-04-301921-11-17Subscription;Renewal;Renewal;Renewal1.05660;0;01008
\n", - "
" - ], - "text/plain": [ - " member_id subscription_start subscription_end \\\n", - "23 raphael-france 1920-04-30 1921-11-17 \n", - "\n", - " subscription_events subscription_volumes \\\n", - "23 Subscription;Renewal;Renewal;Renewal 1.0 \n", - "\n", - " subscription_days internal_gaps known_borrows \n", - "23 566 0;0;0 1008 " - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# load previously computed partial borrowers list (sequential / near sequential subscriptions collapsed)\n", - "partial_borrowers = pd.read_csv('../appendix/speculative_reading/data/partial_borrowers_collapsed.csv')\n", - "partial_borrowers.sort_values('known_borrows', ascending=False, inplace=True)\n", - "# parse subscription dates so we can use them to identify circulating books\n", - "partial_borrowers['subscription_start'] = pd.to_datetime(partial_borrowers['subscription_start'])\n", - "partial_borrowers['subscription_end'] = pd.to_datetime(partial_borrowers['subscription_end'])\n", - "partial_borrowers.head(1)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "# generate subset of events dataset with dates, for use in identifying books \n", - "# in circulation during and before these subscriptions\n", - "\n", - "dated_events_df = events_df.copy()\n", - "dated_events_df['start_date_dt'] = pd.to_datetime(dated_events_df['start_date'], errors='coerce')\n", - "dated_events_df['end_date_dt'] = pd.to_datetime(dated_events_df['end_date'], errors='coerce')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2IwUVNfddJ4_" - }, - "source": [ - "## Fit initial model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vYH5o_UqdV2H" - }, - "source": [ - "### Run Model Comparisons" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Define constants\n", - "N_RECOMMENDATIONS = 20\n", - "\n", - "def get_item_ids(user_id: str, bookless_sub: pd.Series, dated_events_df: pd.DataFrame, events_df: pd.DataFrame) -> List[str]:\n", - "\t\"\"\"\n", - "\tGet the item IDs for books that were in circulation during the subscription period\n", - "\n", - "\tParameters\n", - "\t----------\n", - "\tuser_id : str\n", - "\t\tThe user ID for the member\n", - "\tbookless_sub : pd.Series\n", - "\t\tA Series with the subscription start and end dates\n", - "\tdated_events_df : pd.DataFrame\n", - "\t\tA DataFrame with the events data and dates\n", - "\tevents_df : pd.DataFrame\n", - "\t\tA DataFrame with the events data\n", - "\t\n", - "\tReturns\n", - "\t-------\n", - "\tList[str]\n", - "\t\tA list of item IDs for books that were in circulation during the subscription period\n", - "\t\n", - "\t\"\"\"\n", - "\tcirculating_book_events = dated_events_df[(dated_events_df.start_date_dt < bookless_sub.subscription_end) | (dated_events_df.end_date_dt < bookless_sub.subscription_end)]\n", - "\titem_ids = circulating_book_events[circulating_book_events.item_id.notna()].item_id.unique()\n", - "\tmember_book_ids = events_df[(events_df.item_id.notna()) & (events_df.member_id.str.contains(user_id))].item_id.unique()\n", - "\tsubset_item_ids = list(set(item_ids) - set(member_book_ids))\n", - "\treturn subset_item_ids\n", - "\n", - "def get_predictions(user_id: str, bookless_sub: pd.Series, rec: Recommender, subset_item_ids: List[str]) -> pd.DataFrame:\n", - "\t\"\"\"\n", - "\tGet the recommendations for a user\n", - "\n", - "\tParameters\n", - 
"\t----------\n", - "\tuser_id : str\n", - "\t\tThe user ID for the member\n", - "\tbookless_sub : pd.Series\n", - "\t\tA Series with the subscription start and end dates\n", - "\trec : Recommender\n", - "\t\tThe recommender model\n", - "\tsubset_item_ids : List[str]\n", - "\t\tA list of item IDs for books that were in circulation during the subscription period\n", - "\t\n", - "\tReturns\n", - "\t-------\n", - "\tpd.DataFrame\n", - "\t\tA DataFrame with the recommendations for the user\n", - "\n", - "\t\"\"\"\n", - "\tpredictions = rec.recommend(user_id, candidates=subset_item_ids)\n", - "\tpredictions['member_id'] = user_id\n", - "\tpredictions['subscription_start'] = bookless_sub.subscription_start\n", - "\tpredictions['subscription_end'] = bookless_sub.subscription_end\n", - "\tpredictions.rename(columns={'item': 'item_id'}, inplace=True)\n", - "\treturn predictions\n", - "\n", - "def run_model_comparisons(number_of_runs: List[int], return_scores: bool, output_path: str, members: List[str]) -> pd.DataFrame:\n", - "\t\"\"\"\n", - "\tRun model comparisons for a list of run lengths\n", - "\n", - "\tParameters\n", - "\t----------\n", - "\tnumber_of_runs : List[int]\n", - "\t\tA list of run lengths\n", - "\treturn_scores : bool\n", - "\t\tA boolean indicating whether to return scores\n", - "\toutput_path : str\n", - "\t\tThe path to save the output\n", - "\tmembers : List[str]\n", - "\t\tA list of member IDs\n", - "\t\n", - "\tReturns\n", - "\t-------\n", - "\tpd.DataFrame\n", - "\t\tA DataFrame with the model comparisons\n", - "\t\"\"\"\n", - "\tif os.path.exists(output_path):\n", - "\t\tcompare_models = pd.read_csv(output_path)\n", - "\telse: \n", - "\t\tmodel_runs=[]\n", - "\t\tfor run_length in number_of_runs:\n", - "\t\t\tall_recs = []\n", - "\t\t\tfor index in tqdm(range(run_length)):\n", - "\t\t\t\trec = Recommender.adapt(als.ImplicitMF(50, use_ratings=False))\n", - "\t\t\t\trec.fit(ratings)\n", - "\t\t\t\tpopular = Recommender.adapt(basic.Popular())\n", - "\t\t\t\tpopular.fit(ratings)\n", - "\t\t\t\tfor bookless_sub in list(partial_borrowers.itertuples()):\n", - "\t\t\t\t\tuser_id = bookless_sub.member_id\n", - "\t\t\t\t\tif user_id in members:\n", - "\t\t\t\t\t\tsubset_item_ids = get_item_ids(user_id, bookless_sub, dated_events_df, events_df)\n", - "\t\t\t\t\t\tpredictions = get_predictions(user_id, bookless_sub, rec, subset_item_ids)\n", - "\t\t\t\t\t\tpredictions['model_run'] = index\n", - "\t\t\t\t\t\tall_recs.append(predictions)\n", - "\t\t\tall_recs_df = pd.concat(all_recs)\n", - "\t\t\tmetrics_df = all_recs_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score': [np.median, 'skew', 'std', 'var']}).reset_index()\n", - "\t\t\tmetrics_df.columns = list(map(''.join, metrics_df.columns.values))\n", - "\t\t\tmetrics_df.columns = [col if 'score' not in col else col.split('score')[1] for col in metrics_df.columns ]\n", - "\t\t\tkurt_df = all_recs_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].apply(pd.DataFrame.kurt).reset_index(name='kurtosis')\n", - "\t\t\tfinal_df = pd.merge(metrics_df, kurt_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])\n", - "\t\t\tfinal_df['model_loops'] = run_length\n", - "\t\t\tif return_scores:\n", - "\t\t\t\tfinal_df = pd.merge(final_df, all_recs_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'], how='left')\n", - "\t\t\tmodel_runs.append(final_df)\n", - "\t\t\tcompare_models = pd.concat(model_runs)\n", - 
"\t\t\tcompare_models.to_csv(output_path, index=False)\n", - "\treturn compare_models" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "Cpeze52F_UpV" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6fd97b547b6d461d881f3f9053142320", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/10 [00:00 pd.DataFrame:\n", - " \"\"\"\n", - " This function samples scores from a DataFrame for a given number of books and periods.\n", - " \n", - " Parameters:\n", - " df (pd.DataFrame): The DataFrame containing the scores.\n", - " get_top (bool): If True, the function will return the top scores. If False, it will return random scores.\n", - " numb_of_books (int): The number of books to sample scores for.\n", - " \n", - " Returns:\n", - " pd.DataFrame: A DataFrame containing the sampled scores.\n", - " \"\"\"\n", - " \n", - " # Get the unique periods from the DataFrame.\n", - " periods = df.member_period.unique().tolist()\n", - " \n", - " # Initialize an empty list to store the DataFrames for each period.\n", - " visualize_df = []\n", - " \n", - " # For each period...\n", - " for period in periods:\n", - " # Initialize an empty list to store the books for this period.\n", - " final_books = []\n", - " \n", - " # Get the rows from the DataFrame for this period.\n", - " rows = df[df.member_period == period]\n", - " \n", - " # Get the unique loop numbers from the rows.\n", - " loops = rows.model_loops.unique().tolist()\n", - " \n", - " # While the number of books is less than the specified number...\n", - " while len(final_books) < numb_of_books:\n", - " # For each loop...\n", - " for loop in loops:\n", - " # Get the rows for this loop.\n", - " final_rows = rows[rows.model_loops == loop]\n", - " \n", - " # If get_top is True, sort the rows by median score in descending order and get the top books.\n", - " # Otherwise, get a random sample of books.\n", - " if get_top:\n", - " final_rows = final_rows.sort_values(by='median', ascending=False)\n", - " books = final_rows[0:numb_of_books].item_id.unique().tolist()\n", - " else:\n", - " books = rows.item_id.sample(n=numb_of_books).reset_index()\n", - " books = books.item_id.unique().tolist()\n", - " \n", - " # If the number of books is less than the specified number, add more books until the number is reached.\n", - " increment = numb_of_books\n", - " while len(books) < numb_of_books:\n", - " increment = increment + 1\n", - " books = final_rows[0:increment].item_id.unique().tolist()\n", - " \n", - " # Add the books to the list of books for this period.\n", - " final_books.extend(books)\n", - " \n", - " # Remove duplicate books from the list.\n", - " final_books = list(set(final_books))\n", - " \n", - " # Add the rows for the books in the list to the list of DataFrames.\n", - " visualize_df.append(rows[rows.item_id.isin(set(final_books))])\n", - " \n", - " # Concatenate the DataFrames in the list into a single DataFrame.\n", - " final_df = pd.concat(visualize_df)\n", - " \n", - " return final_df" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "id": "a5hqOlTNbHgR" - }, - "outputs": [], - "source": [ - "def visualize_model_stability(df: pd.DataFrame, get_top: bool, numb_of_books: int) -> alt.Chart:\n", - " \"\"\"\n", - " This function visualizes the stability of a model by creating box plots and scatter plots of various score distribution metrics.\n", - " \n", - " Parameters:\n", - " df (pd.DataFrame): The DataFrame containing 
the scores.\n", - " get_top (bool): If True, the function will return the top scores. If False, it will return random scores.\n", - " numb_of_books (int): The number of books to sample scores for.\n", - " \n", - " Returns:\n", - " alt.Chart: A concatenated Altair chart containing the box plots and scatter plots.\n", - " \"\"\"\n", - " \n", - " # Sample scores from the DataFrame.\n", - " sample_df = sample_scores(df, get_top, numb_of_books)\n", - " \n", - " # Define the distribution metrics to be used.\n", - " distribution_metrics = ['median', 'skew', 'std', 'var', 'kurtosis']\n", - " \n", - " # Normalize the distribution metrics in the sample DataFrame using MinMaxScaler.\n", - " sample_df[distribution_metrics] = MinMaxScaler().fit_transform(sample_df[distribution_metrics])\n", - " \n", - " # Melt the sample DataFrame to a long format for visualization.\n", - " melted_sample = pd.melt(sample_df, id_vars=['member_id', 'subscription_start', 'subscription_end', 'item_id', 'model_loops', \n", - " 'member_period'], value_vars=['median', 'skew', 'std', 'var', 'kurtosis'])\n", - "\n", - " # Create a box plot of the distribution metrics.\n", - " boxplot = alt.Chart(melted_sample).mark_boxplot().encode(\n", - " x= alt.X('model_loops:O', axis=alt.Axis(title='')),\n", - " y=alt.Y('value', axis=alt.Axis(title='')),\n", - " column=alt.Column('variable', title=''),\n", - " ).properties(title = \"Variability with Box and Whiskers\")\n", - "\n", - " # Create a scatter plot of the distribution metrics.\n", - " points = alt.Chart(melted_sample).mark_circle().encode(\n", - " x= alt.X('model_loops:O', axis=alt.Axis(title='')),\n", - " y=alt.Y('value', axis=alt.Axis(title='')),\n", - " color=alt.Color('variable', legend=alt.Legend(title=['Measure of', 'Score Variability'])), \n", - " column=alt.Column('variable', title='')\n", - " ).properties(title = \"Variability with Score Distributions\")\n", - "\n", - " # Concatenate the box plot and scatter plot horizontally and return the result.\n", - " return alt.hconcat(boxplot, points).properties(title='Variability in Predicted Scores By Resampling Implicit Matrix Factorization Model ')" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 428 - }, - "id": "HyS03DjrbWKs", - "outputId": "ce98fa42-3e18-4f1b-8a69-a36cb9d9fc3f" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.HConcatChart(...)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chart = visualize_model_stability(compare_models, True, 10)\n", - "chart.configure_axisX(\n", - " labelAngle=0\n", - ").configure_title(anchor='middle')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DwEwSsFdjD7r" - }, - "source": [ - "### Select Optimal Model and Generate Item Scores" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "id": "_Z3-aAZHdr4p" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "db7232cf2a7b4c2294eca12b03b7c5aa", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/100 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
useritemrating
2rhysconrad-typhoon1
8lanux-eyre-dewoolf-night-day1
12teryjames-joyce1
13teryfreeman-portrait-george-moore1
22macleish-adastern-tents-israel1
38alvearyeats-later-poems1
46joyce-jamesmantzius-history-theatrical-art1
51joyce-jamesscott-poems-walter-scott1
52joyce-jameschekhov-horse-stealers-stories1
53joyce-jamesstephens-crock-gold1
\n", + "" ], - "source": [ - "final_model[['member_id', 'subscription_start', 'subscription_end']].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "id": "Uluk5V6yF9xR" - }, - "outputs": [], - "source": [ - "final_model['member_period'] = final_model.member_id + ': ' + final_model.subscription_start.astype(str) + '/' + final_model.subscription_end.astype(str)" + "text/plain": [ + " user item rating\n", + "2 rhys conrad-typhoon 1\n", + "8 lanux-eyre-de woolf-night-day 1\n", + "12 tery james-joyce 1\n", + "13 tery freeman-portrait-george-moore 1\n", + "22 macleish-ada stern-tents-israel 1\n", + "38 alvear yeats-later-poems 1\n", + "46 joyce-james mantzius-history-theatrical-art 1\n", + "51 joyce-james scott-poems-walter-scott 1\n", + "52 joyce-james chekhov-horse-stealers-stories 1\n", + "53 joyce-james stephens-crock-gold 1" ] - }, + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load the initial data into four DataFrames: events_df, members_df, books_df, and borrow_overrides_df.\n", + "data = get_preprocessed_data()\n", + "events_df = data[\"events\"]\n", + "members_df = data[\"members\"]\n", + "books_df = data[\"books\"]\n", + "# borrow_overrides unused in this notebook\n", + "\n", + "# Get all member-book interactions from the events DataFrame.\n", + "# Only include rows where the item URI is not null.\n", + "interactions_df = events_df[events_df.item_uri.notna()].copy()\n", + "\n", + "# Restrict the interactions to borrow events only.\n", + "interactions_df = interactions_df[interactions_df.event_type == 'Borrow'].copy()\n", + "\n", + "# Reduce the interactions DataFrame to the minimum user/item interaction fields and drop duplicate rows.\n", + "unique_interactions_df = interactions_df[\n", + " [\"member_id\", \"item_id\"]\n", + "].drop_duplicates(subset=[\"member_id\", \"item_id\"])\n", + "\n", + "# Rename the columns to the names expected by LensKit.\n", + "# The DataFrame is renamed to 'ratings' for use with the tutorial.\n", + "ratings = unique_interactions_df.rename(columns={'member_id': 'user', 'item_id': 'item'})\n", + "\n", + "# The example assumes a rating. Use a 1/0 rating and set all to 1 to confirm interaction.\n", + "ratings['rating'] = 1\n", + "\n", + "# Display the first 10 rows of the ratings DataFrame.\n", + "ratings.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "juCy1eUbHsuc", - "outputId": "bf9cdb0d-cbde-4430-a133-b832d6442fde" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "member_period\n", - "hemingway-ernest: 1921-12-28/1922-11-08 654\n", - "hemingway-ernest: 1924-03-28/1925-03-28 1285\n", - "Name: item_id, dtype: int64" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_idsubscription_startsubscription_endsubscription_eventssubscription_volumessubscription_daysinternal_gapsknown_borrows
23raphael-france1920-04-301921-11-17Subscription;Renewal;Renewal;Renewal1.05660;0;01008
86kittredge-eleanor-hayden1924-01-171924-05-17Subscription;Renewal2.01210583
89kittredge-eleanor-hayden1929-09-101929-12-10Subscription2.091NaN583
\n", + "
" ], - "source": [ - "final_model[(final_model.member_id == 'hemingway-ernest') & (final_model.model_run ==0)].groupby('member_period')['item_id'].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "id": "RWdlg9pmLadd" - }, - "outputs": [], - "source": [ - "subset_model = final_model[final_model.member_id == 'hemingway-ernest']" + "text/plain": [ + " member_id subscription_start subscription_end \\\n", + "23 raphael-france 1920-04-30 1921-11-17 \n", + "86 kittredge-eleanor-hayden 1924-01-17 1924-05-17 \n", + "89 kittredge-eleanor-hayden 1929-09-10 1929-12-10 \n", + "\n", + " subscription_events subscription_volumes \\\n", + "23 Subscription;Renewal;Renewal;Renewal 1.0 \n", + "86 Subscription;Renewal 2.0 \n", + "89 Subscription 2.0 \n", + "\n", + " subscription_days internal_gaps known_borrows \n", + "23 566 0;0;0 1008 \n", + "86 121 0 583 \n", + "89 91 NaN 583 " ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# load previously computed partial borrowers list (sequential / near sequential subscriptions collapsed)\n", + "partial_borrowers = pd.read_csv(DATA_DIR / 'partial_borrowers_collapsed.csv')\n", + "partial_borrowers.sort_values('known_borrows', ascending=False, inplace=True)\n", + "# parse subscription dates so we can use them to identify circulating books\n", + "partial_borrowers['subscription_start'] = pd.to_datetime(partial_borrowers['subscription_start'])\n", + "partial_borrowers['subscription_end'] = pd.to_datetime(partial_borrowers['subscription_end'])\n", + "partial_borrowers.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# generate subset of events dataset with dates, for use in identifying books \n", + "# in circulation during and before these subscriptions\n", + "\n", + "dated_events_df = events_df.copy()\n", + "dated_events_df['start_date_dt'] = pd.to_datetime(dated_events_df['start_date'], errors='coerce')\n", + "dated_events_df['end_date_dt'] = pd.to_datetime(dated_events_df['end_date'], errors='coerce')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2IwUVNfddJ4_" + }, + "source": [ + "## Fit initial model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vYH5o_UqdV2H" + }, + "source": [ + "### Run Model Comparisons" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Define constants\n", + "N_RECOMMENDATIONS = 20\n", + "\n", + "def get_item_ids(user_id: str, bookless_sub: pd.Series, dated_events_df: pd.DataFrame, events_df: pd.DataFrame) -> List[str]:\n", + "\t\"\"\"\n", + "\tGet the item IDs for books that were in circulation during the subscription period\n", + "\n", + "\tParameters\n", + "\t----------\n", + "\tuser_id : str\n", + "\t\tThe user ID for the member\n", + "\tbookless_sub : pd.Series\n", + "\t\tA Series with the subscription start and end dates\n", + "\tdated_events_df : pd.DataFrame\n", + "\t\tA DataFrame with the events data and dates\n", + "\tevents_df : pd.DataFrame\n", + "\t\tA DataFrame with the events data\n", + "\t\n", + "\tReturns\n", + "\t-------\n", + "\tList[str]\n", + "\t\tA list of item IDs for books that were in circulation during the subscription period\n", + "\t\n", + "\t\"\"\"\n", + "\tcirculating_book_events = dated_events_df[(dated_events_df.start_date_dt < bookless_sub.subscription_end) | (dated_events_df.end_date_dt < bookless_sub.subscription_end)]\n", + 
"\titem_ids = circulating_book_events[circulating_book_events.item_id.notna()].item_id.unique()\n", + "\tmember_book_ids = events_df[(events_df.item_id.notna()) & (events_df.member_id.str.contains(user_id))].item_id.unique()\n", + "\tsubset_item_ids = list(set(item_ids) - set(member_book_ids))\n", + "\treturn subset_item_ids\n", + "\n", + "def get_predictions(user_id: str, bookless_sub: pd.Series, rec: Recommender, subset_item_ids: List[str]) -> pd.DataFrame:\n", + "\t\"\"\"\n", + "\tGet the recommendations for a user\n", + "\n", + "\tParameters\n", + "\t----------\n", + "\tuser_id : str\n", + "\t\tThe user ID for the member\n", + "\tbookless_sub : pd.Series\n", + "\t\tA Series with the subscription start and end dates\n", + "\trec : Recommender\n", + "\t\tThe recommender model\n", + "\tsubset_item_ids : List[str]\n", + "\t\tA list of item IDs for books that were in circulation during the subscription period\n", + "\t\n", + "\tReturns\n", + "\t-------\n", + "\tpd.DataFrame\n", + "\t\tA DataFrame with the recommendations for the user\n", + "\n", + "\t\"\"\"\n", + "\tpredictions = rec.recommend(user_id, candidates=subset_item_ids)\n", + "\tpredictions['member_id'] = user_id\n", + "\tpredictions['subscription_start'] = bookless_sub.subscription_start\n", + "\tpredictions['subscription_end'] = bookless_sub.subscription_end\n", + "\tpredictions.rename(columns={'item': 'item_id'}, inplace=True)\n", + "\treturn predictions\n", + "\n", + "def run_model_comparisons(number_of_runs: List[int], return_scores: bool, output_path: str, members: List[str]) -> pd.DataFrame:\n", + "\t\"\"\"\n", + "\tRun model comparisons for a list of run lengths\n", + "\n", + "\tParameters\n", + "\t----------\n", + "\tnumber_of_runs : List[int]\n", + "\t\tA list of run lengths\n", + "\treturn_scores : bool\n", + "\t\tA boolean indicating whether to return scores\n", + "\toutput_path : str\n", + "\t\tThe path to save the output\n", + "\tmembers : List[str]\n", + "\t\tA list of member IDs\n", + "\t\n", + "\tReturns\n", + "\t-------\n", + "\tpd.DataFrame\n", + "\t\tA DataFrame with the model comparisons\n", + "\t\"\"\"\n", + "\tif os.path.exists(output_path):\n", + "\t\tcompare_models = pd.read_csv(output_path)\n", + "\telse: \n", + "\t\tmodel_runs=[]\n", + "\t\tfor run_length in number_of_runs:\n", + "\t\t\tall_recs = []\n", + "\t\t\tfor index in tqdm(range(run_length)):\n", + "\t\t\t\trec = Recommender.adapt(als.ImplicitMF(50, use_ratings=False))\n", + "\t\t\t\trec.fit(ratings)\n", + "\t\t\t\tpopular = Recommender.adapt(basic.Popular())\n", + "\t\t\t\tpopular.fit(ratings)\n", + "\t\t\t\tfor bookless_sub in list(partial_borrowers.itertuples()):\n", + "\t\t\t\t\tuser_id = bookless_sub.member_id\n", + "\t\t\t\t\tif user_id in members:\n", + "\t\t\t\t\t\tsubset_item_ids = get_item_ids(user_id, bookless_sub, dated_events_df, events_df)\n", + "\t\t\t\t\t\tpredictions = get_predictions(user_id, bookless_sub, rec, subset_item_ids)\n", + "\t\t\t\t\t\tpredictions['model_run'] = index\n", + "\t\t\t\t\t\tall_recs.append(predictions)\n", + "\t\t\tall_recs_df = pd.concat(all_recs)\n", + "\t\t\tmetrics_df = all_recs_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score': [np.median, 'skew', 'std', 'var']}).reset_index()\n", + "\t\t\tmetrics_df.columns = list(map(''.join, metrics_df.columns.values))\n", + "\t\t\tmetrics_df.columns = [col if 'score' not in col else col.split('score')[1] for col in metrics_df.columns ]\n", + "\t\t\tkurt_df = all_recs_df.groupby(['member_id', 'subscription_start', 
'subscription_end', 'item_id'])['score'].apply(pd.DataFrame.kurt).reset_index(name='kurtosis')\n", + "\t\t\tfinal_df = pd.merge(metrics_df, kurt_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])\n", + "\t\t\tfinal_df['model_loops'] = run_length\n", + "\t\t\tif return_scores:\n", + "\t\t\t\tfinal_df = pd.merge(final_df, all_recs_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'], how='left')\n", + "\t\t\tmodel_runs.append(final_df)\n", + "\t\t\tcompare_models = pd.concat(model_runs)\n", + "\t\t\tcompare_models.to_csv(output_path, index=False)\n", + "\treturn compare_models" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "Cpeze52F_UpV" + }, + "outputs": [], + "source": [ + "# specify run size in number of runs\n", + "number_of_runs = [10,20,50,100, 200]\n", + "# specify members\n", + "members = ['kittredge-eleanor-hayden', 'colens-fernand', 'raphael-france', 'hemingway-ernest']\n", + "compare_models = run_model_comparisons(number_of_runs, False, './data/lenskit_comparison_model_runs.csv', members)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "COMVpXwHdZPp" + }, + "source": [ + "### Visualize stability of model scores" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "5ozr3kDtaM6U" + }, + "outputs": [], + "source": [ + "compare_models['member_period'] = compare_models.member_id + ': ' + compare_models.subscription_start.astype(str) + '/' + compare_models.subscription_end.astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "dVc1uhYTaSek" + }, + "outputs": [], + "source": [ + "def sample_scores(df: pd.DataFrame, get_top: bool, numb_of_books: int) -> pd.DataFrame:\n", + " \"\"\"\n", + " This function samples scores from a DataFrame for a given number of books and periods.\n", + " \n", + " Parameters:\n", + " df (pd.DataFrame): The DataFrame containing the scores.\n", + " get_top (bool): If True, the function will return the top scores. 
If False, it will return random scores.\n", + " numb_of_books (int): The number of books to sample scores for.\n", + " \n", + " Returns:\n", + " pd.DataFrame: A DataFrame containing the sampled scores.\n", + " \"\"\"\n", + " \n", + " # Get the unique periods from the DataFrame.\n", + " periods = df.member_period.unique().tolist()\n", + " \n", + " # Initialize an empty list to store the DataFrames for each period.\n", + " visualize_df = []\n", + " \n", + " # For each period...\n", + " for period in periods:\n", + " # Initialize an empty list to store the books for this period.\n", + " final_books = []\n", + " \n", + " # Get the rows from the DataFrame for this period.\n", + " rows = df[df.member_period == period]\n", + " \n", + " # Get the unique loop numbers from the rows.\n", + " loops = rows.model_loops.unique().tolist()\n", + " \n", + " # While the number of books is less than the specified number...\n", + " while len(final_books) < numb_of_books:\n", + " # For each loop...\n", + " for loop in loops:\n", + " # Get the rows for this loop.\n", + " final_rows = rows[rows.model_loops == loop]\n", + " \n", + " # If get_top is True, sort the rows by median score in descending order and get the top books.\n", + " # Otherwise, get a random sample of books.\n", + " if get_top:\n", + " final_rows = final_rows.sort_values(by='median', ascending=False)\n", + " books = final_rows[0:numb_of_books].item_id.unique().tolist()\n", + " else:\n", + " books = rows.item_id.sample(n=numb_of_books).reset_index()\n", + " books = books.item_id.unique().tolist()\n", + " \n", + " # If the number of books is less than the specified number, add more books until the number is reached.\n", + " increment = numb_of_books\n", + " while len(books) < numb_of_books:\n", + " increment = increment + 1\n", + " books = final_rows[0:increment].item_id.unique().tolist()\n", + " \n", + " # Add the books to the list of books for this period.\n", + " final_books.extend(books)\n", + " \n", + " # Remove duplicate books from the list.\n", + " final_books = list(set(final_books))\n", + " \n", + " # Add the rows for the books in the list to the list of DataFrames.\n", + " visualize_df.append(rows[rows.item_id.isin(set(final_books))])\n", + " \n", + " # Concatenate the DataFrames in the list into a single DataFrame.\n", + " final_df = pd.concat(visualize_df)\n", + " \n", + " return final_df" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "a5hqOlTNbHgR" + }, + "outputs": [], + "source": [ + "def visualize_model_stability(df: pd.DataFrame, get_top: bool, numb_of_books: int) -> alt.Chart:\n", + " \"\"\"\n", + " This function visualizes the stability of a model by creating box plots and scatter plots of various score distribution metrics.\n", + " \n", + " Parameters:\n", + " df (pd.DataFrame): The DataFrame containing the scores.\n", + " get_top (bool): If True, the function will return the top scores. 
If False, it will return random scores.\n", + " numb_of_books (int): The number of books to sample scores for.\n", + " \n", + " Returns:\n", + " alt.Chart: A concatenated Altair chart containing the box plots and scatter plots.\n", + " \"\"\"\n", + " \n", + " # Sample scores from the DataFrame.\n", + " sample_df = sample_scores(df, get_top, numb_of_books)\n", + " \n", + " # Define the distribution metrics to be used.\n", + " distribution_metrics = ['median', 'skew', 'std', 'var', 'kurtosis']\n", + " \n", + " # Normalize the distribution metrics in the sample DataFrame using MinMaxScaler.\n", + " sample_df[distribution_metrics] = MinMaxScaler().fit_transform(sample_df[distribution_metrics])\n", + " \n", + " # Melt the sample DataFrame to a long format for visualization.\n", + " melted_sample = pd.melt(sample_df, id_vars=['member_id', 'subscription_start', 'subscription_end', 'item_id', 'model_loops', \n", + " 'member_period'], value_vars=['median', 'skew', 'std', 'var', 'kurtosis'])\n", + "\n", + " # Create a box plot of the distribution metrics.\n", + " boxplot = alt.Chart(melted_sample).mark_boxplot().encode(\n", + " x= alt.X('model_loops:O', axis=alt.Axis(title='')),\n", + " y=alt.Y('value', axis=alt.Axis(title='')),\n", + " column=alt.Column('variable', title=''),\n", + " ).properties(title = \"Variability with Box and Whiskers\")\n", + "\n", + " # Create a scatter plot of the distribution metrics.\n", + " points = alt.Chart(melted_sample).mark_circle().encode(\n", + " x= alt.X('model_loops:O', axis=alt.Axis(title='')),\n", + " y=alt.Y('value', axis=alt.Axis(title='')),\n", + " color=alt.Color('variable', legend=alt.Legend(title=['Measure of', 'Score Variability'])), \n", + " column=alt.Column('variable', title='')\n", + " ).properties(title = \"Variability with Score Distributions\")\n", + "\n", + " # Concatenate the box plot and scatter plot horizontally and return the result.\n", + " return alt.hconcat(boxplot, points).properties(title='Variability in Predicted Scores By Resampling Implicit Matrix Factorization Model ')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 428 }, + "id": "HyS03DjrbWKs", + "outputId": "ce98fa42-3e18-4f1b-8a69-a36cb9d9fc3f" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 179 - }, - "id": "DIh_AnSMGGbt", - "outputId": "d670a3a7-83e0-4774-86a1-846619e5425d" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
member_idsubscription_startsubscription_enditem_idmedianskewstdvarkurtosismodel_loopsscoremodel_runmember_period
7200hemingway-ernest1921-12-281922-11-08burney-evelina-history-young0.320107-0.0480810.189160.0357810.0238061000.8098430hemingway-ernest: 1921-12-28/1922-11-08
\n", - "
" - ], - "text/plain": [ - " member_id subscription_start subscription_end \\\n", - "7200 hemingway-ernest 1921-12-28 1922-11-08 \n", - "\n", - " item_id median skew std var \\\n", - "7200 burney-evelina-history-young 0.320107 -0.048081 0.18916 0.035781 \n", - "\n", - " kurtosis model_loops score model_run \\\n", - "7200 0.023806 100 0.809843 0 \n", - "\n", - " member_period \n", - "7200 hemingway-ernest: 1921-12-28/1922-11-08 " - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + "
\n", + "" ], - "source": [ - "top_results = sample_scores(subset_model, True, 36)\n", - "top_results[0:1]" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "id": "D7ACefq438yi" - }, - "outputs": [], - "source": [ - "def get_formatted_titles(row):\n", - "\n", - " item = books_df[books_df.item_id == row.item_id]\n", - " if item.author.isna().any() == False:\n", - " author = ' '.join(item.author.str.split(',').values[0][::-1])\n", - " author = ' by' + author\n", - " else: \n", - " author = '(Periodical)'\n", - " title = item.title.values[0]\n", - " return title + author" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "id": "CGqXVIA64og0" - }, - "outputs": [], - "source": [ - "top_results['formatted_title'] = top_results.apply(get_formatted_titles, axis=1)" + "text/plain": [ + "alt.HConcatChart(...)" ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chart = visualize_model_stability(compare_models, True, 10)\n", + "chart.configure_axisX(\n", + " labelAngle=0\n", + ").configure_title(anchor='middle')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DwEwSsFdjD7r" + }, + "source": [ + "### Select Optimal Model and Generate Item Scores" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "_Z3-aAZHdr4p" + }, + "outputs": [], + "source": [ + "final_run = [100]\n", + "members = ['hemingway-ernest']\n", + "final_model = run_model_comparisons(final_run, True, f'./data/lenskit_model{str(final_run[0])}_scores.csv', members)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "BOacDcQIjNb9" + }, + "outputs": [], + "source": [ + "member_subscriptions = final_model[['member_id', 'subscription_start', 'subscription_end']].drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "XVVmXsmvrNus", + "outputId": "e0d5e8e3-d06e-4937-dc56-bd4ebcfec1a0" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "id": "nqiDWqem6Xw1" - }, - "outputs": [], - "source": [ - "top_results['period'] = top_results.member_period.str.split(':').str[1]" + "data": { + "text/plain": [ + "member_id subscription_start subscription_end\n", + "hemingway-ernest 1924-03-28 1925-03-28 128500\n", + " 1921-12-28 1922-11-08 65400\n", + "Name: count, dtype: int64" ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_model[['member_id', 'subscription_start', 'subscription_end']].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "Uluk5V6yF9xR" + }, + "outputs": [], + "source": [ + "final_model['member_period'] = final_model.member_id + ': ' + final_model.subscription_start.astype(str) + '/' + final_model.subscription_end.astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "juCy1eUbHsuc", + "outputId": "bf9cdb0d-cbde-4430-a133-b832d6442fde" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zbG_i17NCNpn", - "outputId": "11ee943b-20db-47fc-f627-2e01ab4ab4da" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(59, 52)" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "len(top_results.item_id.unique()), len(top_results[top_results.formatted_title.str.contains('Periodical') == False].item_id.unique())" + "data": { + "text/plain": [ + "member_period\n", + "hemingway-ernest: 1921-12-28/1922-11-08 654\n", + "hemingway-ernest: 1924-03-28/1925-03-28 1285\n", + "Name: item_id, dtype: int64" ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_model[(final_model.member_id == 'hemingway-ernest') & (final_model.model_run ==0)].groupby('member_period')['item_id'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "RWdlg9pmLadd" + }, + "outputs": [], + "source": [ + "subset_model = final_model[final_model.member_id == 'hemingway-ernest']" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 179 }, + "id": "DIh_AnSMGGbt", + "outputId": "d670a3a7-83e0-4774-86a1-846619e5425d" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "jGrhrzCdLQ7X", - "outputId": "8fe0f047-a8ba-4e1f-b13b-189c55e7533e" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "period\n", - " 1921-12-28/1922-11-08 36\n", - " 1924-03-28/1925-03-28 36\n", - "Name: item_id, dtype: int64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_idsubscription_startsubscription_enditem_idmedianskewstdvarkurtosismodel_loopsscoremodel_runmember_period
7200hemingway-ernest1921-12-281922-11-08burney-evelina-history-young0.320107-0.0480810.189160.0357810.0238061000.8098430hemingway-ernest: 1921-12-28/1922-11-08
\n", + "
" ], - "source": [ - "top_results.groupby('period').item_id.nunique()" + "text/plain": [ + " member_id subscription_start subscription_end \\\n", + "7200 hemingway-ernest 1921-12-28 1922-11-08 \n", + "\n", + " item_id median skew std var \\\n", + "7200 burney-evelina-history-young 0.320107 -0.048081 0.18916 0.035781 \n", + "\n", + " kurtosis model_loops score model_run \\\n", + "7200 0.023806 100 0.809843 0 \n", + "\n", + " member_period \n", + "7200 hemingway-ernest: 1921-12-28/1922-11-08 " ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_results = sample_scores(subset_model, True, 36)\n", + "top_results[0:1]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "D7ACefq438yi" + }, + "outputs": [], + "source": [ + "def get_formatted_titles(row):\n", + "\n", + " item = books_df[books_df.id == row.item_id]\n", + " if item.author.isna().any() == False:\n", + " author = ' '.join(item.author.str.split(',').values[0][::-1])\n", + " author = ' by' + author\n", + " else: \n", + " author = '(Periodical)'\n", + " title = item.title.values[0]\n", + " return title + author" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "CGqXVIA64og0" + }, + "outputs": [], + "source": [ + "top_results['formatted_title'] = top_results.apply(get_formatted_titles, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "nqiDWqem6Xw1" + }, + "outputs": [], + "source": [ + "top_results['period'] = top_results.member_period.str.split(':').str[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "zbG_i17NCNpn", + "outputId": "11ee943b-20db-47fc-f627-2e01ab4ab4da" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "05283Ys2LQiM", - "outputId": "cc89e25b-f9da-4090-ad54-443fa723bee5" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "period\n", - " 1921-12-28/1922-11-08 34\n", - " 1924-03-28/1925-03-28 29\n", - "Name: item_id, dtype: int64" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "top_results[top_results.formatted_title.str.contains('Periodical') == False].groupby('period').item_id.nunique()" + "data": { + "text/plain": [ + "(59, 52)" ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(top_results.item_id.unique()), len(top_results[top_results.formatted_title.str.contains('Periodical') == False].item_id.unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "jGrhrzCdLQ7X", + "outputId": "8fe0f047-a8ba-4e1f-b13b-189c55e7533e" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "id": "3J07CDIr0Xx1" - }, - "outputs": [], - "source": [ - "items = top_results[top_results.member_id == 'hemingway-ernest'].groupby(['period','item_id'])['score'].mean().reset_index(name='avg').sort_values(by='avg', ascending=False)" + "data": { + "text/plain": [ + "period\n", + "1921-12-28/1922-11-08 36\n", + "1924-03-28/1925-03-28 36\n", + "Name: item_id, dtype: int64" ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_results.groupby('period').item_id.nunique()" + 
] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "05283Ys2LQiM", + "outputId": "cc89e25b-f9da-4090-ad54-443fa723bee5" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "TCIh8hqRDsu3", - "outputId": "930d1695-15b7-4bcc-c61f-9efe77d6d108" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "period\n", - " 1921-12-28/1922-11-08 36\n", - " 1924-03-28/1925-03-28 36\n", - "dtype: int64" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "items.groupby(['period']).size()" + "data": { + "text/plain": [ + "period\n", + "1921-12-28/1922-11-08 34\n", + "1924-03-28/1925-03-28 29\n", + "Name: item_id, dtype: int64" ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_results[top_results.formatted_title.str.contains('Periodical') == False].groupby('period').item_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "id": "3J07CDIr0Xx1" + }, + "outputs": [], + "source": [ + "items = top_results[top_results.member_id == 'hemingway-ernest'].groupby(['period','item_id'])['score'].mean().reset_index(name='avg').sort_values(by='avg', ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "TCIh8hqRDsu3", + "outputId": "930d1695-15b7-4bcc-c61f-9efe77d6d108" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "id": "x6an0dLz2mcR" - }, - "outputs": [], - "source": [ - "members = top_results.member_id.unique().tolist()\n", - "charts = []\n", - "members=['hemingway-ernest']\n", - "for member in members:\n", - " full_name = members_df[members_df.id == member].name.values[0]\n", - " tickplot = alt.Chart(top_results[(top_results.member_id == member)]).mark_tick(opacity=0.7).encode(\n", - " y=alt.Y('formatted_title', sort='-x', axis=alt.Axis(title=\"Predicted Book\")),\n", - " x='score',\n", - " color=alt.Color('period:N', legend=alt.Legend(title=\"Missing Borrowing Records Period\")),\n", - " # facet='member_period:N'\n", - " ).properties(\n", - " title=f'Top Predictions by Implicit Matrix Factorization Model',\n", - " width=300\n", - " )\n", - " charts.append(tickplot)\n" + "data": { + "text/plain": [ + "period\n", + "1921-12-28/1922-11-08 36\n", + "1924-03-28/1925-03-28 36\n", + "dtype: int64" ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "items.groupby(['period']).size()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "x6an0dLz2mcR" + }, + "outputs": [], + "source": [ + "members = top_results.member_id.unique().tolist()\n", + "charts = []\n", + "members=['hemingway-ernest']\n", + "for member in members:\n", + " full_name = members_df[members_df.id == member].name.values[0]\n", + " tickplot = alt.Chart(top_results[(top_results.member_id == member)]).mark_tick(opacity=0.7).encode(\n", + " y=alt.Y('formatted_title', sort='-x', axis=alt.Axis(title=\"Predicted Book\")),\n", + " x='score',\n", + " color=alt.Color('period:N', legend=alt.Legend(title=\"Missing Borrowing Records Period\")),\n", + " # facet='member_period:N'\n", + " ).properties(\n", + " title=f'Top Predictions by Implicit Matrix Factorization Model',\n", + " width=300\n", 
+ " )\n", + " charts.append(tickplot)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, + "id": "1uLzE3Cs6IeL", + "outputId": "18ffe8c3-162d-4c10-dcbe-40904443e55d" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 45, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "1uLzE3Cs6IeL", - "outputId": "18ffe8c3-162d-4c10-dcbe-40904443e55d" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + "
\n", + "" ], - "source": [ - "charts[0].configure_axisY(\n", - " titleAngle=0,\n", - " titleAlign=\"left\",\n", - " titleY=-10,\n", - " titleX=-100,\n", - " labelLimit=1000\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "iFNwpGKN3eqB", - "outputId": "c2c1175c-8dcc-4c0b-bb20-afa24c87565c" - }, - "outputs": [], - "source": [ - "# alt.hconcat(*charts).configure_axisY(\n", - "# titleAngle=0,\n", - "# titleAlign=\"left\",\n", - "# titleY=-10,\n", - "# titleX=-10,\n", - "# labelLimit=1000\n", - "# )" + "text/plain": [ + "alt.Chart(...)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mQdXDLaMGLq_" - }, - "outputs": [], - "source": [ - "# # [top_results.member_period == 'colens-fernand: 1920-04-01/1920-07-07']\n", - "# tickplot = alt.Chart(top_results).mark_tick(opacity=0.7).encode(\n", - "# y=alt.Y('item_id', sort='-x'),\n", - "# x='score',\n", - "# color='member_period:N',\n", - "# # facet='member_period:N'\n", - "# )\n", - "# tickplot" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [], - "source": [ - "top_results[top_results.formatted_title.str.contains('Periodical') == False].to_csv('./data/top_scores_lenskit_model100.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": { - "id": "GY36uXxDrkhb" - }, - "outputs": [], - "source": [ - "final_model['zscore'] = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].transform(lambda x : zscore(x,ddof=1))" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "id": "GyXSOx3FsXwA" - }, - "outputs": [], - "source": [ - "top_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score':'max'})[['score']].reset_index()\n", - "top_scores = pd.merge(top_scores, final_model, on=top_scores.columns.tolist(), how='inner')\n", - "\n", - "top_scores = top_scores.rename(columns={'score': 'top_score', 'zscore' : 'top_zscore'})" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": { - "id": "bUKA8e8evqdZ" - }, - "outputs": [], - "source": [ - "avg_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].mean().reset_index(name='avg_score')\n", - "scores_df = pd.merge(top_scores, avg_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": { - "id": "TcnFmCfsb3vm" - }, - "outputs": [], - "source": [ - "std_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score':'std'}).reset_index()\n", - "std_scores = std_scores.rename(columns={'score': 'std_score'})\n", - "scores_df = pd.merge(scores_df, std_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": { - "id": "OuXdNkW2-Qje" - }, - "outputs": [], - "source": [ - "median_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].median().reset_index(name='median_score')\n", - "scores_df = pd.merge(scores_df, median_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "id": "8JYQlLyL1Exy" - }, 
- "outputs": [], - "source": [ - "import scipy\n", - "mode_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].agg(lambda x: scipy.stats.mode(x)[0][0]).reset_index()\n", - "mode_scores = pd.merge(mode_scores, final_model, on=mode_scores.columns.tolist(), how='inner')\n", - "mode_scores = mode_scores.rename(columns={'score': 'mode_score', 'zscore' : 'mode_zscore'})" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": { - "id": "VUA6CLNT70pn" - }, - "outputs": [], - "source": [ - "final_scores = pd.merge(mode_scores[['member_id', 'subscription_start', 'subscription_end', 'item_id', 'mode_score', 'mode_zscore']], scores_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qfaWBviL9E75" - }, - "outputs": [], - "source": [ - "# for index, group in member_subscriptions.iterrows():\n", - "# print(group.to_dict())\n", - "# rows = final_scores[(final_scores.member_id == group.member_id) & (final_scores.subscription_start == group.subscription_start) & (final_scores.subscription_end == group.subscription_end)]\n", - "# print('top_scores:', rows.sort_values(by=['top_score'], ascending=False)[0:5][['item_id', 'top_score']].to_dict())\n", - "# print('avg_scores:', rows.sort_values(by=['avg_score'], ascending=False)[0:5][['item_id', 'avg_score']].to_dict())\n", - "# print('mode_scores:', rows.sort_values(by=['mode_score'], ascending=False)[0:5][['item_id', 'mode_score']].to_dict())" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": { - "id": "k_bFoYYh-ZR7" - }, - "outputs": [], - "source": [ - "final_scores.to_csv(f'./data/collapsed_lenskit_model{str(final_run[0])}_scores.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { + ], + "source": [ + "charts[0].configure_axisY(\n", + " titleAngle=0,\n", + " titleAlign=\"left\",\n", + " titleY=-10,\n", + " titleX=-100,\n", + " labelLimit=1000\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { "colab": { - "collapsed_sections": [], - "name": "lenskit_model_scores_stability.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" + "base_uri": "https://localhost:8080/", + "height": 1000 }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" + "id": "iFNwpGKN3eqB", + "outputId": "c2c1175c-8dcc-4c0b-bb20-afa24c87565c" + }, + "outputs": [], + "source": [ + "# alt.hconcat(*charts).configure_axisY(\n", + "# titleAngle=0,\n", + "# titleAlign=\"left\",\n", + "# titleY=-10,\n", + "# titleX=-10,\n", + "# labelLimit=1000\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "id": "mQdXDLaMGLq_" + }, + "outputs": [], + "source": [ + "# # [top_results.member_period == 'colens-fernand: 1920-04-01/1920-07-07']\n", + "# tickplot = alt.Chart(top_results).mark_tick(opacity=0.7).encode(\n", + "# y=alt.Y('item_id', sort='-x'),\n", + "# x='score',\n", + "# color='member_period:N',\n", + "# # facet='member_period:N'\n", + "# )\n", + "# tickplot" + ] + }, + { + "cell_type": "code", + 
"execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "top_results[top_results.formatted_title.str.contains('Periodical') == False].to_csv('./data/top_scores_lenskit_model100.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "id": "GY36uXxDrkhb" + }, + "outputs": [], + "source": [ + "final_model['zscore'] = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].transform(lambda x : zscore(x,ddof=1))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "id": "GyXSOx3FsXwA" + }, + "outputs": [], + "source": [ + "top_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score':'max'})[['score']].reset_index()\n", + "top_scores = pd.merge(top_scores, final_model, on=top_scores.columns.tolist(), how='inner')\n", + "\n", + "top_scores = top_scores.rename(columns={'score': 'top_score', 'zscore' : 'top_zscore'})" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "id": "bUKA8e8evqdZ" + }, + "outputs": [], + "source": [ + "avg_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].mean().reset_index(name='avg_score')\n", + "scores_df = pd.merge(top_scores, avg_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "id": "TcnFmCfsb3vm" + }, + "outputs": [], + "source": [ + "std_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score':'std'}).reset_index()\n", + "std_scores = std_scores.rename(columns={'score': 'std_score'})\n", + "scores_df = pd.merge(scores_df, std_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "id": "OuXdNkW2-Qje" + }, + "outputs": [], + "source": [ + "median_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].median().reset_index(name='median_score')\n", + "scores_df = pd.merge(scores_df, median_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "id": "8JYQlLyL1Exy" + }, + "outputs": [], + "source": [ + "import scipy\n", + "mode_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].agg(lambda x: scipy.stats.mode(x)[0]).reset_index()\n", + "mode_scores = pd.merge(mode_scores, final_model, on=mode_scores.columns.tolist(), how='inner')\n", + "mode_scores = mode_scores.rename(columns={'score': 'mode_score', 'zscore' : 'mode_zscore'})" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "id": "VUA6CLNT70pn" + }, + "outputs": [], + "source": [ + "final_scores = pd.merge(mode_scores[['member_id', 'subscription_start', 'subscription_end', 'item_id', 'mode_score', 'mode_zscore']], scores_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "id": "qfaWBviL9E75" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Predictions for hemingway-ernest, subscription 1921-12-28 to 1922-11-08\n", + "\n", + "Top scores:\n", + " item_id top_score\n", + " joyce-exiles 1.326223\n", + 
"mansfield-bliss-short-stories 1.183781\n", + " joyce-portrait-artist-young 1.162345\n", + " oneill-beyond-horizon 1.095982\n", + " cather-antonia 1.093625\n", + "\n", + "Average scores:\n", + " item_id avg_score\n", + " oneill-beyond-horizon 0.546617\n", + " conrad-shadow-line-confession 0.510644\n", + "twain-adventures-huckleberry-finn 0.421426\n", + " morris-specimens-early-english 0.402074\n", + " eastman-enjoyment-poetry 0.373157\n", + "\n", + "Mode scores:\n", + " item_id mode_score\n", + "morris-specimens-early-english 0.073113\n", + " conrad-shadow-line-confession 0.069347\n", + " schreiner-story-african-farm 0.064250\n", + " frank-rahab 0.058457\n", + " saltus-paliser-case 0.021517\n", + "\n", + "Predictions for hemingway-ernest, subscription 1924-03-28 to 1925-03-28\n", + "\n", + "Top scores:\n", + " item_id top_score\n", + " joyce-exiles 1.326223\n", + "mansfield-bliss-short-stories 1.183781\n", + " joyce-portrait-artist-young 1.162345\n", + " forster-howards-end 1.141532\n", + " melville-moby-dick-whale 1.139511\n", + "\n", + "Average scores:\n", + " item_id avg_score\n", + " oneill-hairy-ape 0.590482\n", + " criterion 0.587726\n", + " forster-howards-end 0.572118\n", + " oneill-beyond-horizon 0.546617\n", + "conrad-shadow-line-confession 0.510644\n", + "\n", + "Mode scores:\n", + " item_id mode_score\n", + " ponsonby-english-diaries 0.243522\n", + " leskov-sentry-stories 0.242126\n", + " meredith-amazing-marriage 0.183322\n", + " machen-house-souls 0.093522\n", + "saintsbury-collected-essays-papers 0.077510\n" + ] } + ], + "source": [ + "for index, group in member_subscriptions.iterrows():\n", + " print(\"\\nPredictions for %(member_id)s, subscription %(subscription_start)s to %(subscription_end)s\" %\n", + " group.to_dict())\n", + " rows = final_scores[(final_scores.member_id == group.member_id) & (final_scores.subscription_start == group.subscription_start) & (final_scores.subscription_end == group.subscription_end)]\n", + " top_scores = rows.sort_values(by=['top_score'], ascending=False)[0:5][['item_id', 'top_score']]\n", + " print(\"\\nTop scores:\")\n", + " print(top_scores.to_string(index=False))\n", + " avg_scores = rows.sort_values(by=['avg_score'], ascending=False)[0:5][['item_id', 'avg_score']]\n", + " print(\"\\nAverage scores:\")\n", + " print(avg_scores.to_string(index=False))\n", + " mode_scores = rows.sort_values(by=['mode_score'], ascending=False)[0:5][['item_id', 'mode_score']]\n", + " print(\"\\nMode scores:\")\n", + " print(mode_scores.to_string(index=False))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "id": "k_bFoYYh-ZR7" + }, + "outputs": [], + "source": [ + "final_scores.to_csv(f'./data/collapsed_lenskit_model{str(final_run[0])}_scores.csv', index=False)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "lenskit_model_scores_stability.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/tests/test_utils/test_data.py b/tests/test_utils/test_data.py index 643d465..a840ef8 100644 --- a/tests/test_utils/test_data.py +++ b/tests/test_utils/test_data.py @@ -35,18 
+35,18 @@ def test_load_initial_data(): @patch("utils.missing_data_processing.pd") @patch("utils.missing_data_processing.preprocess_events_data") -@patch("utils.missing_data_processing.preprocess_books_data") -def test_get_preprocessed_data(mock_preprocess_books, mock_preprocess_events, mock_pd): +@patch("utils.missing_data_processing.preprocess_shxco_data") +def test_get_preprocessed_data(mock_preprocess_shxco, mock_preprocess_events, mock_pd): # no datasets specified: should return all data = missing_data_processing.get_preprocessed_data() for dataset in missing_data_processing.CSV_PATHS.keys(): assert dataset in data assert mock_pd.read_csv.call_count == 4 mock_preprocess_events.assert_called() - mock_preprocess_books.assert_called() + mock_preprocess_shxco.assert_called() # reset mocks - for m in [mock_preprocess_books, mock_preprocess_events, mock_pd]: + for m in [mock_preprocess_shxco, mock_preprocess_events, mock_pd]: m.reset_mock() # test loading selected datasets @@ -55,7 +55,7 @@ def test_get_preprocessed_data(mock_preprocess_books, mock_preprocess_events, mo assert "books" in data assert "borrow_overrides" in data mock_preprocess_events.assert_not_called() - mock_preprocess_books.assert_called() + mock_preprocess_shxco.assert_called() # test unknown dataset with pytest.raises(ValueError): diff --git a/utils/missing_data_processing.py b/utils/missing_data_processing.py index 036ecb4..586f271 100644 --- a/utils/missing_data_processing.py +++ b/utils/missing_data_processing.py @@ -91,7 +91,9 @@ def get_preprocessed_data(*datasets) -> Dict[str, pd.DataFrame]: if "events" in datasets: data["events"] = preprocess_events_data(data["events"]) if "books" in datasets: - data["books"] = preprocess_books_data(data["books"]) + data["books"] = preprocess_shxco_data(data["books"]) + if "members" in datasets: + data["members"] = preprocess_shxco_data(data["members"]) return data @@ -131,24 +133,24 @@ def preprocess_events_data(events_df: pd.DataFrame) -> pd.DataFrame: return events_df -def preprocess_books_data(books_df: pd.DataFrame) -> pd.DataFrame: +def preprocess_shxco_data(df: pd.DataFrame) -> pd.DataFrame: """ - Pre-processing for book data. + Pre-processing for book or member data. - This function processes the 'books' data by generating short-form IDs - from the longer project URIs. + This function processes the 'books' or 'members' data by generating + short-form IDs from the longer project URIs. Args: - books_df (pd.DataFrame): The initial 'books' DataFrame. + df (pd.DataFrame): The initial 'books' or 'members' DataFrame. Returns: - pd.DataFrame: The processed 'books' DataFrame. + pd.DataFrame: processed 'books' or 'members' DataFrame. """ # Generate short IDs from item URIs - books_df["id"] = books_df.uri.apply(short_id) + df["id"] = df.uri.apply(short_id) - # Return the processed 'books' DataFrame. - return books_df + # Return the processed 'DataFrame. 
+ return df def get_logbook_events(events_df: pd.DataFrame) -> pd.DataFrame: From 436e710ed0070d07954e4c76d650e16b661dceb0 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Mon, 6 May 2024 16:29:52 -0400 Subject: [PATCH 2/2] Add lenskit to python requirements; update requirements.lock --- requirements.lock | 4 ++++ requirements.txt | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/requirements.lock b/requirements.lock index 763538a..bbcb163 100644 --- a/requirements.lock +++ b/requirements.lock @@ -57,6 +57,7 @@ ipywidgets==8.1.2 isoduration==20.11.0 jedi==0.19.1 Jinja2==3.1.2 +joblib==1.4.2 json5==0.9.24 jsonpointer==2.4 jsonschema==4.21.1 @@ -101,6 +102,7 @@ parso==0.8.4 pexpect==4.9.0 pillow==10.3.0 platformdirs==4.2.0 +plotly==5.22.0 pluggy==1.5.0 portpicker==1.5.2 powerlaw==1.5 @@ -132,6 +134,7 @@ requests==2.31.0 rfc3339-validator==0.1.4 rfc3986-validator==0.1.1 rpds-py==0.18.0 +scikit-learn==1.4.2 scipy==1.13.0 seaborn==0.11.0 seedbank==0.1.3 @@ -145,6 +148,7 @@ stack-data==0.6.3 stanio==0.5.0 tenacity==8.2.3 terminado==0.18.1 +threadpoolctl==3.5.0 tinycss2==1.2.1 tomli==2.0.1 toolz==0.12.0 diff --git a/requirements.txt b/requirements.txt index a3c63dd..3e373ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ great-tables # we used an unreleased version of copia with label options for plots git+https://github.com/mikekestemont/copia@3e57da4 matplotlib==3.7 -scikit-learn \ No newline at end of file +scikit-learn +lenskit \ No newline at end of file
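
---

For readers of this patch, a minimal sketch of the score-collapsing step exercised by the updated notebook cells: several model runs yield one score per (member, subscription, item), which the notebook collapses into top / average / median / std / mode summaries. The DataFrame below is synthetic illustration data, not project data, and the snippet is a compressed restatement of those cells rather than a drop-in replacement (the notebook also carries z-scores through its merges). The change from `scipy.stats.mode(x)[0][0]` to `scipy.stats.mode(x)[0]` in the updated cell presumably tracks SciPy >= 1.11 (pinned as 1.13.0 in requirements.lock), where `mode` returns a scalar by default.

```python
# Sketch of collapsing per-run recommendation scores, assuming pandas and scipy
# as pinned in requirements.lock; the example rows below are made up.
import pandas as pd
import scipy.stats

runs = pd.DataFrame(
    {
        "member_id": ["hemingway-ernest"] * 6,
        "subscription_start": ["1921-12-28"] * 6,
        "subscription_end": ["1922-11-08"] * 6,
        "item_id": ["joyce-exiles"] * 3 + ["oneill-beyond-horizon"] * 3,
        "score": [1.32, 1.10, 1.10, 0.55, 0.54, 0.54],
    }
)

group_cols = ["member_id", "subscription_start", "subscription_end", "item_id"]

collapsed = runs.groupby(group_cols)["score"].agg(
    top_score="max",
    avg_score="mean",
    median_score="median",
    std_score="std",
    # scipy >= 1.11 returns a scalar mode, so a single [0] suffices
    # (the earlier notebook cell used scipy.stats.mode(x)[0][0])
    mode_score=lambda x: scipy.stats.mode(x)[0],
)

print(collapsed.reset_index())
```

Each summary column here corresponds to one of the intermediate frames (`top_scores`, `avg_scores`, `std_scores`, `median_scores`, `mode_scores`) that the notebook builds separately and then merges into `final_scores` before writing the collapsed CSV.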