Commit

fix: Replace deprecated dataset in tutorials and docs (#2462)
Signed-off-by: Anton Kukushkin <[email protected]>
kukushking authored Sep 15, 2023
1 parent 67d40c1 commit 1842da8
Showing 4 changed files with 215 additions and 775 deletions.
16 changes: 7 additions & 9 deletions docs/source/scale.rst
@@ -37,18 +37,16 @@ In distributed mode, the same ``awswrangler`` APIs can now handle much larger da
 
 .. code-block:: python
 
-    # Read Parquet data (1.2 Gb Parquet compressed)
-    df = wr.s3.read_parquet(
-        path=f"s3://amazon-reviews-pds/parquet/product_category=Toys/",
-    )
+    # Read 1.6 Gb Parquet data
+    df = wr.s3.read_parquet(path="s3://ursa-labs-taxi-data/2017/")
 
-    # Drop the customer_id column
-    df.drop("customer_id", axis=1, inplace=True)
+    # Drop vendor_id column
+    df.drop("vendor_id", axis=1, inplace=True)
 
-    # Filter reviews with 5-star rating
-    df5 = df[df["star_rating"] == 5]
+    # Filter trips over 1 mile
+    df1 = df[df["trip_distance"] > 1]
 
-In the example above, Amazon product data is read from Amazon S3 into a distributed `Modin data frame <https://modin.readthedocs.io/en/stable/getting_started/why_modin/pandas.html>`_.
+In the example above, New York City Taxi data is read from Amazon S3 into a distributed `Modin data frame <https://modin.readthedocs.io/en/stable/getting_started/why_modin/pandas.html>`_.
 Modin is a drop-in replacement for Pandas. It exposes the same APIs but enables you to use all of the cores on your machine, or all of the workers in an entire cluster, leading to improved performance and scale.
 To use it, make sure to replace your pandas import statement with modin:

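The remainder of this section (truncated from the hunk above) shows the import swap itself. As a minimal sketch of Modin's documented drop-in import, not the exact block from the file:

.. code-block:: python

    # Replace the standard pandas import ...
    # import pandas as pd
    # ... with Modin's drop-in equivalent:
    import modin.pandas as pd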
2 changes: 1 addition & 1 deletion tests/glue_scripts/wrangler_blog_simple.py
@@ -13,7 +13,7 @@
 # Drop vendor_id column
 df.drop("vendor_id", axis=1, inplace=True)
 
-# Filter trips with 1 passenger
+# Filter trips over 1 mile
 df1 = df[df["trip_distance"] > 1]
 
 # Write partitioned trips to S3 in Parquet format
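The partitioned write that follows in the script is truncated from this hunk. A sketch of what such a write typically looks like with awswrangler's documented ``wr.s3.to_parquet`` API (the destination path and partition column are illustrative assumptions, not the script's actual values):

    import awswrangler as wr

    # Write the filtered trips as a partitioned Parquet dataset on S3
    # (path and partition_cols are assumed for illustration).
    wr.s3.to_parquet(
        df=df1,
        path="s3://example-bucket/taxi/",
        dataset=True,  # dataset mode is required for partitioning
        partition_cols=["payment_type"],
    )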
219 changes: 52 additions & 167 deletions tutorials/029 - S3 Select.ipynb
@@ -2,21 +2,33 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "[![AWS SDK for pandas](_static/logo.png \"AWS SDK for pandas\")](https://github.com/aws/aws-sdk-pandas)"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "# 29 - S3 Select"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "AWS SDK for pandas supports [Amazon S3 Select](https://aws.amazon.com/blogs/aws/s3-glacier-select/), enabling applications to use SQL statements in order to query and filter the contents of a single S3 object. It works on objects stored in CSV, JSON or Apache Parquet, including compressed and large files of several TBs.\n",
     "\n",
@@ -32,172 +44,28 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "## Read multiple Parquet files from an S3 prefix"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 1,
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
    "outputs": [
     {
      "data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>marketplace</th>\n",
" <th>customer_id</th>\n",
" <th>review_id</th>\n",
" <th>product_id</th>\n",
" <th>product_parent</th>\n",
" <th>star_rating</th>\n",
" <th>helpful_votes</th>\n",
" <th>total_votes</th>\n",
" <th>vine</th>\n",
" <th>verified_purchase</th>\n",
" <th>review_headline</th>\n",
" <th>review_body</th>\n",
" <th>review_date</th>\n",
" <th>year</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>US</td>\n",
" <td>52670295</td>\n",
" <td>RGPOFKORD8RTU</td>\n",
" <td>B0002CZPPG</td>\n",
" <td>867256265</td>\n",
" <td>5</td>\n",
" <td>105</td>\n",
" <td>107</td>\n",
" <td>N</td>\n",
" <td>N</td>\n",
" <td>Excellent Gift Idea</td>\n",
" <td>I wonder if the other reviewer actually read t...</td>\n",
" <td>2005-02-08</td>\n",
" <td>2005</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>US</td>\n",
" <td>29964102</td>\n",
" <td>R2U8X8V5KPB4J3</td>\n",
" <td>B00H5BMF00</td>\n",
" <td>373287760</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Y</td>\n",
" <td>Five Stars</td>\n",
" <td>convenience is the name of the game.</td>\n",
" <td>2015-05-03</td>\n",
" <td>2015</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>US</td>\n",
" <td>25173351</td>\n",
" <td>R15XV3LXUMLTXL</td>\n",
" <td>B00PG40CO4</td>\n",
" <td>137115061</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Y</td>\n",
" <td>Birthday Gift</td>\n",
" <td>This gift card was handled with accuracy in de...</td>\n",
" <td>2015-05-03</td>\n",
" <td>2015</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>US</td>\n",
" <td>12516181</td>\n",
" <td>R3G6G7H8TX4H0T</td>\n",
" <td>B0002CZPPG</td>\n",
" <td>867256265</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>N</td>\n",
" <td>N</td>\n",
" <td>Love 'em.</td>\n",
" <td>Gotta love these iTunes Prepaid Card thingys. ...</td>\n",
" <td>2005-10-15</td>\n",
" <td>2005</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>US</td>\n",
" <td>38355314</td>\n",
" <td>R2NJ7WNBU16YTQ</td>\n",
" <td>B00B2TFSO6</td>\n",
" <td>89375983</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Y</td>\n",
" <td>Five Stars</td>\n",
" <td>perfect</td>\n",
" <td>2015-05-03</td>\n",
" <td>2015</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" marketplace customer_id review_id product_id product_parent \\\n",
"0 US 52670295 RGPOFKORD8RTU B0002CZPPG 867256265 \n",
"1 US 29964102 R2U8X8V5KPB4J3 B00H5BMF00 373287760 \n",
"2 US 25173351 R15XV3LXUMLTXL B00PG40CO4 137115061 \n",
"3 US 12516181 R3G6G7H8TX4H0T B0002CZPPG 867256265 \n",
"4 US 38355314 R2NJ7WNBU16YTQ B00B2TFSO6 89375983 \n",
"\n",
" star_rating helpful_votes total_votes vine verified_purchase \\\n",
"0 5 105 107 N N \n",
"1 5 0 0 N Y \n",
"2 5 0 0 N Y \n",
"3 5 6 6 N N \n",
"4 5 0 0 N Y \n",
"\n",
" review_headline review_body \\\n",
"0 Excellent Gift Idea I wonder if the other reviewer actually read t... \n",
"1 Five Stars convenience is the name of the game. \n",
"2 Birthday Gift This gift card was handled with accuracy in de... \n",
"3 Love 'em. Gotta love these iTunes Prepaid Card thingys. ... \n",
"4 Five Stars perfect \n",
"\n",
" review_date year \n",
"0 2005-02-08 2005 \n",
"1 2015-05-03 2015 \n",
"2 2015-05-03 2015 \n",
"3 2005-10-15 2005 \n",
"4 2015-05-03 2015 "
]
"text/plain": " vendor_id pickup_at dropoff_at \\\n0 2 2019-01-01T00:48:10.000Z 2019-01-01T01:36:58.000Z \n1 2 2019-01-01T00:38:36.000Z 2019-01-01T01:21:33.000Z \n2 2 2019-01-01T00:10:43.000Z 2019-01-01T01:23:59.000Z \n3 1 2019-01-01T00:13:17.000Z 2019-01-01T01:06:13.000Z \n4 2 2019-01-01T00:29:11.000Z 2019-01-01T01:29:05.000Z \n\n passenger_count trip_distance rate_code_id store_and_fwd_flag \\\n0 1 31.570000 1 N \n1 2 33.189999 5 N \n2 1 33.060001 1 N \n3 1 44.099998 5 N \n4 2 31.100000 1 N \n\n pickup_location_id dropoff_location_id payment_type fare_amount extra \\\n0 138 138 2 82.5 0.5 \n1 107 265 1 121.0 0.0 \n2 243 42 2 92.0 0.5 \n3 132 265 2 150.0 0.0 \n4 169 201 1 85.5 0.5 \n\n mta_tax tip_amount tolls_amount improvement_surcharge total_amount \\\n0 0.5 0.00 0.00 0.3 83.800003 \n1 0.0 0.08 10.50 0.3 131.880005 \n2 0.5 0.00 5.76 0.3 99.059998 \n3 0.0 0.00 0.00 0.3 150.300003 \n4 0.5 0.00 7.92 0.3 94.720001 \n\n congestion_surcharge \n0 NaN \n1 NaN \n2 NaN \n3 NaN \n4 NaN ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>vendor_id</th>\n <th>pickup_at</th>\n <th>dropoff_at</th>\n <th>passenger_count</th>\n <th>trip_distance</th>\n <th>rate_code_id</th>\n <th>store_and_fwd_flag</th>\n <th>pickup_location_id</th>\n <th>dropoff_location_id</th>\n <th>payment_type</th>\n <th>fare_amount</th>\n <th>extra</th>\n <th>mta_tax</th>\n <th>tip_amount</th>\n <th>tolls_amount</th>\n <th>improvement_surcharge</th>\n <th>total_amount</th>\n <th>congestion_surcharge</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>2</td>\n <td>2019-01-01T00:48:10.000Z</td>\n <td>2019-01-01T01:36:58.000Z</td>\n <td>1</td>\n <td>31.570000</td>\n <td>1</td>\n <td>N</td>\n <td>138</td>\n <td>138</td>\n <td>2</td>\n <td>82.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.00</td>\n <td>0.00</td>\n <td>0.3</td>\n <td>83.800003</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>2019-01-01T00:38:36.000Z</td>\n <td>2019-01-01T01:21:33.000Z</td>\n <td>2</td>\n <td>33.189999</td>\n <td>5</td>\n <td>N</td>\n <td>107</td>\n <td>265</td>\n <td>1</td>\n <td>121.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.08</td>\n <td>10.50</td>\n <td>0.3</td>\n <td>131.880005</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>2019-01-01T00:10:43.000Z</td>\n <td>2019-01-01T01:23:59.000Z</td>\n <td>1</td>\n <td>33.060001</td>\n <td>1</td>\n <td>N</td>\n <td>243</td>\n <td>42</td>\n <td>2</td>\n <td>92.0</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.00</td>\n <td>5.76</td>\n <td>0.3</td>\n <td>99.059998</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1</td>\n <td>2019-01-01T00:13:17.000Z</td>\n <td>2019-01-01T01:06:13.000Z</td>\n <td>1</td>\n <td>44.099998</td>\n <td>5</td>\n <td>N</td>\n <td>132</td>\n <td>265</td>\n <td>2</td>\n <td>150.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.00</td>\n <td>0.00</td>\n <td>0.3</td>\n <td>150.300003</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>4</th>\n <td>2</td>\n <td>2019-01-01T00:29:11.000Z</td>\n <td>2019-01-01T01:29:05.000Z</td>\n <td>2</td>\n <td>31.100000</td>\n <td>1</td>\n <td>N</td>\n <td>169</td>\n <td>201</td>\n <td>1</td>\n <td>85.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.00</td>\n <td>7.92</td>\n <td>0.3</td>\n <td>94.720001</td>\n <td>NaN</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 1,
"metadata": {},
@@ -208,25 +76,34 @@
    "import awswrangler as wr\n",
    "\n",
    "df = wr.s3.select_query(\n",
-   "    sql=\"SELECT * FROM s3object s where s.\\\"star_rating\\\" >= 5\",\n",
-   "    path=\"s3://amazon-reviews-pds/parquet/product_category=Gift_Card/\",\n",
+   "    sql=\"SELECT * FROM s3object s where s.\\\"trip_distance\\\" > 30\",\n",
+   "    path=\"s3://ursa-labs-taxi-data/2019/01/\",\n",
    "    input_serialization=\"Parquet\",\n",
    "    input_serialization_params={},\n",
    ")\n",
-   "df.loc[:, df.columns != \"product_title\"].head()"
+   "\n",
+   "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
-  "metadata": {},
+  "metadata": {
+   "pycharm": {
+    "name": "#%% md\n"
+   }
+  },
   "source": [
    "## Read full CSV file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
-  "metadata": {},
+  "metadata": {
+   "pycharm": {
+    "name": "#%%\n"
+   }
+  },
   "outputs": [
    {
     "data": {
@@ -340,15 +217,23 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "## Filter JSON file"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
    "outputs": [
     {
      "data": {
@@ -468,4 +353,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
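The notebook's "Read full CSV file" and "Filter JSON file" cells are truncated in this diff. As hedged sketches against the documented ``wr.s3.select_query`` API (the object paths here are illustrative assumptions, not the notebook's actual values):

    import awswrangler as wr

    # Read a full CSV object via S3 Select; FileHeaderInfo="Use" treats the
    # first row as the header (path is assumed for illustration).
    df_csv = wr.s3.select_query(
        sql="SELECT * FROM s3object",
        path="s3://example-bucket/data.csv",
        input_serialization="CSV",
        input_serialization_params={"FileHeaderInfo": "Use"},
    )

    # Filter a JSON object; Type="Document" parses the object as a single
    # JSON document (path is assumed for illustration).
    df_json = wr.s3.select_query(
        sql="SELECT * FROM s3object[*] s",
        path="s3://example-bucket/data.json",
        input_serialization="JSON",
        input_serialization_params={"Type": "Document"},
    )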
