Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add s3 provider #1009

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ docker run -p 8080:8080 -e 'GITHUB_OAUTH_KEY=YOURKEY' \
Or to use your GitHub personal access token, you can just set `GITHUB_API_TOKEN`.


## S3 buckets
Files in S3 buckets can be accessed by their s3 uri like `s3://bucket/path/to/key`. This works directly for public buckets. If you want to access private buckets, you need to provide the s3 authentication credentials to the docker container or in your environment.
For the docker container this can be done by setting the [environment variables](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#environment-variables) with `-e AWS_ACCESS_KEY_ID=my_secret_id -e AWS_SECRET_ACCESS_KEY=my_secret_key`.
Or you can provide the [shared credentials file](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#shared-credentials-file) to the user running the nbviewer (in docker with a volume).

## GitHub Enterprise

To use nbviewer on your own GitHub Enterprise instance you need to set `GITHUB_API_URL`.
Expand Down
5 changes: 5 additions & 0 deletions nbviewer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@ class NBViewer(Application):
default_value="nbviewer.providers.local.handlers.LocalFileHandler",
help="The Tornado handler to use for viewing notebooks found on a local filesystem",
).tag(config=True)
# Import path of the Tornado handler that renders notebooks fetched from
# Amazon S3 (registered by nbviewer.providers.s3.handlers.default_handlers).
s3_handler = Unicode(
    default_value="nbviewer.providers.s3.handlers.S3Handler",
    help="The Tornado handler to use for viewing notebooks from amazon S3",
).tag(config=True)
url_handler = Unicode(
default_value="nbviewer.providers.url.handlers.URLHandler",
help="The Tornado handler to use for viewing notebooks accessed via URL",
Expand Down Expand Up @@ -625,6 +629,7 @@ def init_tornado_application(self):
github_user_handler=self.github_user_handler,
index_handler=self.index_handler,
local_handler=self.local_handler,
s3_handler=self.s3_handler,
url_handler=self.url_handler,
user_gists_handler=self.user_gists_handler,
)
Expand Down
6 changes: 3 additions & 3 deletions nbviewer/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
# -----------------------------------------------------------------------------

# Providers enabled by default; "s3" is included so s3:// URIs are routed
# to the S3 handler without extra configuration.
# NOTE: the page scrape contained both the pre- and post-change comprehension
# lines here, which would be a syntax error; only the s3-enabled version is kept.
default_providers = [
    "nbviewer.providers.{}".format(prov) for prov in ["url", "github", "gist", "s3"]
]

# Providers whose uri_rewrites() are consulted, in priority order ("url" last
# as the catch-all).
default_rewrites = [
    "nbviewer.providers.{}".format(prov)
    for prov in ["gist", "github", "dropbox", "huggingface", "s3", "url"]
]


Expand Down Expand Up @@ -83,7 +83,7 @@ def _load_provider_feature(feature, providers, **handler_names):
try:
# Ex: handler_names['url_handler']
handler_names[provider_handler_key]
except KeyError:
except KeyError as e:
continue
else:
# Ex: provider_handlers['url_handler'] = handler_names['url_handler']
Expand Down
3 changes: 3 additions & 0 deletions nbviewer/providers/s3/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .handlers import default_handlers
from .handlers import S3Handler
from .handlers import uri_rewrites
149 changes: 149 additions & 0 deletions nbviewer/providers/s3/handlers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# -----------------------------------------------------------------------------
# Copyright (C) Jupyter Development Team
#
# Distributed under the terms of the BSD License. The full license is in
# the file COPYING, distributed as part of this software.
# -----------------------------------------------------------------------------
import errno
import io
import os
from datetime import datetime
from urllib.parse import urlparse

import boto3
import botocore
from tornado import iostream
from tornado import web

from .. import _load_handler_from_location
from ...utils import url_path_join
from ..base import cached
from ..base import RenderingHandler


class S3Handler(RenderingHandler):
    """Renderer for ``s3://`` URIs.

    Serves notebooks stored in Amazon S3 buckets. Credentials are resolved
    by boto3 from the environment (env vars or the shared credentials file);
    public buckets work without any credentials.
    """

    def initialize(self, **kwargs):
        # One client per handler instance; boto3 resolves credentials lazily.
        self.s3_client = boto3.client("s3")
        # Per-request cache of the last object fetched so that a notebook
        # already read during this request is not fetched from S3 again.
        # NOTE(review): Tornado builds a new handler per request, so this
        # cache does not persist across requests.
        self._downloadable_data = None
        self._downloaded_path = None
        super().initialize(**kwargs)

    @staticmethod
    def _parse_s3_uri(path):
        """Split an ``s3://bucket/key`` URI into ``(bucket, key)``."""
        parsed = urlparse(path)
        # netloc is the bucket; drop the single leading "/" from the key.
        return parsed.netloc, parsed.path[1:]

    async def download(self, path):
        """Stream the notebook at *path* to the client as an attachment."""
        headers = await self.get_notebook_headers(path)
        filename = os.path.basename(path)
        self.set_header("Content-Length", headers["ContentLength"])
        # Escape commas to work around a Chrome issue with commas in
        # download filenames.
        self.set_header(
            "Content-Disposition",
            "attachment; filename={};".format(filename.replace(",", "_")),
        )
        if self._downloaded_path == path and self._downloadable_data is not None:
            content = self._downloadable_data
        else:
            content = await self.read_s3_file(path)

        if isinstance(content, bytes):
            content = [content]
        for chunk in content:
            try:
                self.write(chunk)
                await self.flush()
            except iostream.StreamClosedError:
                # Client disconnected mid-download; nothing left to do.
                return

    async def get_notebook_data(self, path):
        """Return *path* for rendering, or None after serving a download."""
        is_download = self.get_query_arguments("download")
        if is_download:
            await self.download(path)
            # None signals the caller that the response is already complete.
            return

        return path

    async def get_notebook_headers(self, path):
        """Return the S3 HEAD metadata (e.g. ContentLength) for *path*.

        Raises
        ------
        tornado.web.HTTPError
            404 if the object does not exist in the bucket.
        """
        bucket, key = self._parse_s3_uri(path)
        self.log.debug("Getting headers for %s from %s", key, bucket)
        try:
            head = self.s3_client.head_object(Bucket=bucket, Key=key)
        except botocore.exceptions.ClientError as ex:
            if ex.response["Error"]["Code"] == "404":
                self.log.info("The notebook %s does not exist.", path)
                raise web.HTTPError(404) from ex
            # Re-raise with the original traceback intact.
            raise
        return head

    async def read_s3_file(self, path):
        """Download the notebook file from S3 and return it as text.

        Raises
        ------
        tornado.web.HTTPError
            404 if the object does not exist in the bucket.
        """
        bucket, key = self._parse_s3_uri(path)
        s3_file = io.BytesIO()
        self.log.debug("Reading %s from %s", key, bucket)
        try:
            self.s3_client.download_fileobj(bucket, key, s3_file)
        except botocore.exceptions.ClientError as ex:
            if ex.response["Error"]["Code"] == "404":
                self.log.info("The notebook %s does not exist.", path)
                raise web.HTTPError(404) from ex
            # Re-raise with the original traceback intact.
            raise
        s3_file.seek(0)
        self.log.debug("Done downloading.")
        # Cache the decoded content for a possible re-read in this request.
        self._downloadable_data = s3_file.read().decode("utf-8")
        self._downloaded_path = path
        return self._downloadable_data

    async def deliver_notebook(self, path):
        """Fetch the notebook at *path* and render it to the client."""
        nbdata = await self.read_s3_file(path)

        # Explanation of some kwargs passed into `finish_notebook`:
        # breadcrumbs: list of dict
        #     Breadcrumb 'name' and 'url' to render as links at the top of the notebook page
        # title: str
        #     Title to use as the HTML page title (i.e., text on the browser tab)
        await self.finish_notebook(
            nbdata,
            download_url="?download",
            msg="file from s3: %s" % path,
            public=False,
            breadcrumbs=[],
            title=os.path.basename(path),
        )

    @cached
    async def get(self, path):
        """Get an s3 notebook

        Parameters
        ==========
        path: str
            s3 uri
        """
        fullpath = await self.get_notebook_data(path)

        # get_notebook_data returns None if a directory is to be shown or a
        # notebook is to be downloaded, i.e. if no notebook is supposed to be
        # rendered, making deliver_notebook inappropriate
        if fullpath is not None:
            await self.deliver_notebook(fullpath)


def default_handlers(handlers=None, **handler_names):
    """Tornado handlers for the s3 provider.

    Parameters
    ----------
    handlers : list, optional
        Existing (pattern, handler, kwargs) routes to extend; a fresh list
        is used when omitted (avoids the mutable-default-argument pitfall).
    **handler_names :
        Must contain 's3_handler', the import path of the handler class.

    Returns
    -------
    list
        The input routes plus the s3 route.
    """
    s3_handler = _load_handler_from_location(handler_names["s3_handler"])

    # %3A is the URL-escaped ':' of the s3 scheme in the request path.
    return (handlers or []) + [(r"/(s3%3A//.*)", s3_handler, {})]


def uri_rewrites(rewrites=None):
    """URI rewrites for the s3 provider.

    s3 URIs are already canonical, so the rewrite is the identity. Unlike the
    original, the incoming *rewrites* are preserved and extended (matching
    how ``default_handlers`` extends its input) instead of being discarded,
    and the mutable default argument is avoided.

    Parameters
    ----------
    rewrites : list, optional
        Existing (pattern, template) rewrites to extend.

    Returns
    -------
    list
        The input rewrites plus the s3 identity rewrite.
    """
    if rewrites is None:
        rewrites = []
    return rewrites + [
        (r"^(s3://.*)$", "{0}"),
    ]
Empty file.
96 changes: 96 additions & 0 deletions nbviewer/providers/s3/tests/test_s3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# -----------------------------------------------------------------------------
# Copyright (C) Jupyter Development Team
#
# Distributed under the terms of the BSD License. The full license is in
# the file COPYING, distributed as part of this software.
# -----------------------------------------------------------------------------
import io
import json
from copy import deepcopy
from unittest.mock import patch

import boto3
import requests

from ....tests.base import FormatHTMLMixin
from ....tests.base import NBViewerTestCase


# Minimal valid nbformat-4 notebook used as the payload the mock "downloads".
MOCK_NOTEBOOK = {
    "cells": [
        {
            "cell_type": "code",
            "execution_count": None,
            "id": "b0939771-a810-4ee0-b440-dbbaeb4f1653",
            "metadata": {},
            "outputs": [],
            "source": [],
        },
        {
            "cell_type": "code",
            "execution_count": None,
            "id": "cc0d476a-d09c-4919-8dd2-c8d67f7431b3",
            "metadata": {},
            "outputs": [],
            "source": [],
        },
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3 (ipykernel)",
            "language": "python",
            "name": "python3",
        },
        "language_info": {
            "codemirror_mode": {"name": "ipython", "version": 3},
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.9.12",
        },
    },
    "nbformat": 4,
    "nbformat_minor": 5,
}


class MockBoto3:
    """Stand-in for the boto3 s3 client used by S3Handler."""

    def download_fileobj(self, Bucket, Key, fileobj):
        """Write a mock notebook for (Bucket, Key) into *fileobj*.

        Mirrors boto3's ``download_fileobj``: writes into the given file
        object and returns None.
        """
        data = deepcopy(MOCK_NOTEBOOK)
        # Embed the requested bucket/key so tests can verify routing.
        data["cells"][0]["source"] = [f"print({Bucket})", f"print({Key})"]
        fileobj.write(json.dumps(data).encode("utf-8"))

    def head_object(self, Bucket, Key):
        """Mock ``head_object``: report the ContentLength of the mock notebook.

        Bug fix: the original did ``f = self.download_fileobj(...)`` and then
        ``f.seek(0)`` — but ``download_fileobj`` returns None (the data lands
        in the passed file object), so that raised AttributeError.
        """
        output_file = io.BytesIO()
        self.download_fileobj(Bucket, Key, output_file)
        return {"ContentLength": len(output_file.getvalue())}


"""
# This test won't work because the server is started through subprocess.Popen, so we can't mock boto3.

class S3TestCase(NBViewerTestCase):

@patch("boto3.client")
def test_url(self, mock_boto3_client):
mockBoto3 = MockBoto3()
mock_boto3_client.return_value = mockBoto3
with patch.object(mockBoto3, 'download_fileobj') as mock_download:
bucket="my_bucket"
key="my_file.ipynb"
url = self.url(f"s3%3A//{bucket}/{key}")
r = requests.get(url)
self.assertEqual(r.status_code, 200)
args = mock_download.call_args_list[-1][:2]
self.assertEqual(args, (bucket, key))


class FormatHTMLLocalFileDefaultTestCase(S3TestCase, FormatHTMLMixin):
pass
"""
1 change: 1 addition & 0 deletions requirements.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
elasticsearch
ipython>=8
boto3
jupyter_client
jupyter_server>=0.2.0
markdown>=3.0,==3.1.1 # pin until we workaround #909, which is a regression in 3.2
Expand Down
20 changes: 18 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ beautifulsoup4==4.11.1
# via nbconvert
bleach==5.0.1
# via nbconvert
boto3==1.23.3
# via -r requirements.in
botocore==1.26.3
# via
# boto3
# s3transfer
certifi==2022.12.7
# via elastic-transport
cffi==1.15.1
Expand Down Expand Up @@ -48,6 +54,10 @@ jinja2==3.1.2
# via
# jupyter-server
# nbconvert
jmespath==1.0.0
# via
# boto3
# botocore
jsonschema==4.17.0
# via nbformat
jupyter-client==7.4.4
Expand Down Expand Up @@ -130,11 +140,15 @@ pyparsing==3.0.9
pyrsistent==0.19.2
# via jsonschema
python-dateutil==2.8.2
# via jupyter-client
# via
# botocore
# jupyter-client
pyzmq==24.0.1
# via
# jupyter-client
# jupyter-server
s3transfer==0.5.2
# via boto3
send2trash==1.8.0
# via jupyter-server
six==1.16.0
Expand Down Expand Up @@ -171,7 +185,9 @@ traitlets==5.5.0
# nbconvert
# nbformat
urllib3==1.26.12
# via elastic-transport
# via
# botocore
# elastic-transport
wcwidth==0.2.5
# via prompt-toolkit
webencodings==0.5.1
Expand Down