"""Interface to the New York Times APIs."""
from collections import Counter
from datetime import datetime, timedelta
import dateutil
import isbnlib
import os
import json
import logging
from sqlalchemy.orm.session import Session
from sqlalchemy.orm.exc import (
NoResultFound,
)
from flask_babel import lazy_gettext as _
from .config import (
CannotLoadConfiguration,
IntegrationException,
)
from core.selftest import (
HasSelfTests,
)
from core.opds_import import MetadataWranglerOPDSLookup
from core.metadata_layer import (
Metadata,
IdentifierData,
ContributorData,
)
from core.model import (
get_one_or_create,
CustomList,
DataSource,
Edition,
ExternalIntegration,
Identifier,
Representation,
)
from core.external_list import TitleFromExternalList
[docs]class NYTAPI(object):
DATE_FORMAT = "%Y-%m-%d"
# NYT best-seller lists are associated with dates, but fields like
# CustomEntry.first_appearance are timezone-aware datetimes. We
# will interpret a date as meaning midnight of that day in New
# York.
#
# NOTE: entries fetched before we made the datetimes
# timezone-aware will have their time zones set to UTC, but the
# difference is negligible.
TIME_ZONE = dateutil.tz.gettz("America/New York")
[docs] @classmethod
def parse_datetime(cls, d):
"""Used to parse the publication date of a NYT best-seller list.
We take midnight Eastern time to be the publication time.
"""
return datetime.strptime(d, cls.DATE_FORMAT).replace(
tzinfo=cls.TIME_ZONE
)
[docs] @classmethod
def parse_date(cls, d):
"""Used to parse the publication date of a book.
We don't know the timezone here, so the date will end up being
stored as midnight UTC.
"""
return cls.parse_datetime(d).date()
[docs] @classmethod
def date_string(cls, d):
return d.strftime(cls.DATE_FORMAT)
[docs]class NYTBestSellerAPI(NYTAPI, HasSelfTests):
PROTOCOL = ExternalIntegration.NYT
GOAL = ExternalIntegration.METADATA_GOAL
NAME = _("NYT Best Seller API")
CARDINALITY = 1
SETTINGS = [
{ "key": ExternalIntegration.PASSWORD, "label": _("API key"), "required": True },
]
# An NYT integration is shared by all libraries in a circulation manager.
SITEWIDE = True
BASE_URL = "http://api.nytimes.com/svc/books/v3/lists"
LIST_NAMES_URL = BASE_URL + "/names.json"
LIST_URL = BASE_URL + ".json?list=%s"
LIST_OF_LISTS_MAX_AGE = timedelta(days=1)
LIST_MAX_AGE = timedelta(days=1)
HISTORICAL_LIST_MAX_AGE = timedelta(days=365)
[docs] @classmethod
def from_config(cls, _db, **kwargs):
integration = cls.external_integration(_db)
if not integration:
message = "No ExternalIntegration found for the NYT."
raise CannotLoadConfiguration(message)
return cls(_db, api_key=integration.password, **kwargs)
def __init__(self, _db, api_key=None, do_get=None, metadata_client=None):
self.log = logging.getLogger("NYT API")
self._db = _db
if not api_key:
raise CannotLoadConfiguration("No NYT API key is specified")
self.api_key = api_key
self.do_get = do_get or Representation.simple_http_get
if not metadata_client:
try:
metadata_client = MetadataWranglerOPDSLookup.from_config(
self._db
)
except CannotLoadConfiguration as e:
self.log.error(
"Metadata wrangler integration is not configured, proceeding without one."
)
self.metadata_client = metadata_client
[docs] @classmethod
def external_integration(cls, _db):
return ExternalIntegration.lookup(
_db, ExternalIntegration.NYT,
ExternalIntegration.METADATA_GOAL
)
def _run_self_tests(self, _db):
yield self.run_test(
"Getting list of best-seller lists", self.list_of_lists
)
@property
def source(self):
return DataSource.lookup(_db, DataSource.NYT)
[docs] def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
if not path.startswith(self.BASE_URL):
if not path.startswith("/"):
path = "/" + path
url = self.BASE_URL + path
else:
url = path
joiner = '?'
if '?' in url:
joiner = '&'
url += joiner + "api-key=" + self.api_key
representation, cached = Representation.get(
self._db, url, do_get=self.do_get, max_age=max_age, debug=True,
pause_before=0.1)
status = representation.status_code
if status == 200:
# Everything's fine.
content = json.loads(representation.content)
return content
diagnostic = "Response from %s was: %r" % (
url, representation.content.decode("utf-8") if representation.content else ""
)
if status == 403:
raise IntegrationException(
"API authentication failed",
"API key is most likely wrong. %s" % diagnostic
)
else:
raise IntegrationException(
"Unknown API error (status %s)" % status, diagnostic
)
[docs] def list_of_lists(self, max_age=LIST_OF_LISTS_MAX_AGE):
return self.request(self.LIST_NAMES_URL, max_age=max_age)
[docs] def list_info(self, list_name):
list_of_lists = self.list_of_lists()
list_info = [x for x in list_of_lists['results']
if x['list_name_encoded'] == list_name]
if not list_info:
raise ValueError("No such list: %s" % list_name)
return list_info[0]
[docs] def best_seller_list(self, list_info, date=None):
"""Create (but don't update) a NYTBestSellerList object."""
if isinstance(list_info, str):
list_info = self.list_info(list_info)
return NYTBestSellerList(list_info, self.metadata_client)
[docs] def update(self, list, date=None, max_age=LIST_MAX_AGE):
"""Update the given list with data from the given date."""
name = list.foreign_identifier
url = self.LIST_URL % name
if date:
url += "&published-date=%s" % self.date_string(date)
data = self.request(url, max_age=max_age)
list.update(data)
[docs] def fill_in_history(self, list):
"""Update the given list with current and historical data."""
for date in list.all_dates:
self.update(list, date, self.HISTORICAL_LIST_MAX_AGE)
self._db.commit()
[docs]class NYTBestSellerList(list):
def __init__(self, list_info, metadata_client):
self.name = list_info['display_name']
self.created = NYTAPI.parse_datetime(list_info['oldest_published_date'])
self.updated = NYTAPI.parse_datetime(list_info['newest_published_date'])
self.foreign_identifier = list_info['list_name_encoded']
if list_info['updated'] == 'WEEKLY':
frequency = 7
elif list_info['updated'] == 'MONTHLY':
frequency = 30
self.frequency = timedelta(frequency)
self.items_by_isbn = dict()
self.metadata_client = metadata_client
self.log = logging.getLogger("NYT Best-seller list %s" % self.name)
@property
def medium(self):
"""What medium are the books on this list?
Lists like "Audio Fiction" contain audiobooks; all others
contain normal books. (TODO: this isn't quite right; the
distinction between ebooks and print books here exists in a
way it doesn't with most other sources of Editions.)
"""
name = self.name
if not name:
return None
if name.startswith("Audio "):
return Edition.AUDIO_MEDIUM
return Edition.BOOK_MEDIUM
@property
def all_dates(self):
"""Yield a list of estimated dates when new editions of this list were
probably published.
"""
date = self.updated
end = self.created
while date >= end:
yield date
old_date = date
date = date - self.frequency
if old_date > end and date < end:
# We overshot the end date.
yield end
[docs] def update(self, json_data):
"""Update the list with information from the given JSON structure."""
for li_data in json_data.get('results', []):
try:
book = li_data['book_details'][0]
key = (
book.get('primary_isbn13') or book.get('primary_isbn10'))
if key in self.items_by_isbn:
item = self.items_by_isbn[key]
self.log.debug("Previously seen ISBN: %r", key)
else:
item = NYTBestSellerListTitle(li_data, self.medium)
self.items_by_isbn[key] = item
self.append(item)
# self.log.debug("Newly seen ISBN: %r, %s", key, len(self))
except ValueError as e:
# Should only happen when the book has no identifier, which...
# should never happen.
self.log.error("No identifier for %r", li_data)
item = None
continue
# This is the date the *best-seller list* was published,
# not the date the book was published.
list_date = NYTAPI.parse_datetime(li_data['published_date'])
if not item.first_appearance or list_date < item.first_appearance:
item.first_appearance = list_date
if (not item.most_recent_appearance
or list_date > item.most_recent_appearance):
item.most_recent_appearance = list_date
[docs] def to_customlist(self, _db):
"""Turn this NYTBestSeller list into a CustomList object."""
data_source = DataSource.lookup(_db, DataSource.NYT)
l, was_new = get_one_or_create(
_db,
CustomList,
data_source=data_source,
foreign_identifier=self.foreign_identifier,
create_method_kwargs = dict(
created=self.created,
)
)
l.name = self.name
l.updated = self.updated
self.update_custom_list(l)
return l
[docs] def update_custom_list(self, custom_list):
"""Make sure the given CustomList's CustomListEntries reflect
the current state of the NYTBestSeller list.
"""
db = Session.object_session(custom_list)
# Add new items to the list.
for i in self:
list_item, was_new = i.to_custom_list_entry(
custom_list, self.metadata_client)
# If possible, associate the item with a Work.
list_item.set_work()
[docs]class NYTBestSellerListTitle(TitleFromExternalList):
def __init__(self, data, medium):
data = data
try:
bestsellers_date = NYTAPI.parse_datetime(
data.get('bestsellers_date')
)
first_appearance = bestsellers_date
most_recent_appearance = bestsellers_date
except ValueError as e:
first_appearance = None
most_recent_appearance = None
try:
# This is the date the _book_ was published, not the date
# the _bestseller list_ was published.
published_date = NYTAPI.parse_date(data.get('published_date'))
except ValueError as e:
published_date = None
details = data['book_details']
other_isbns = []
if len(details) == 0:
publisher = annotation = primary_isbn10 = primary_isbn13 = title = None
display_author = None
else:
d = details[0]
title = d.get('title', None)
display_author = d.get('author', None)
publisher = d.get('publisher', None)
annotation = d.get('description', None)
primary_isbn10 = d.get('primary_isbn10', None)
primary_isbn13 = d.get('primary_isbn13', None)
# The list of other ISBNs frequently contains ISBNs for
# other books in the same series, as well as ISBNs that
# are just wrong. Assign these equivalencies at a low
# level of confidence.
for isbn in d.get('isbns', []):
isbn13 = isbn.get('isbn13', None)
if isbn13:
other_isbns.append(
IdentifierData(Identifier.ISBN, isbn13, 0.50)
)
primary_isbn = primary_isbn13 or primary_isbn10
if primary_isbn:
primary_isbn = IdentifierData(Identifier.ISBN, primary_isbn, 0.90)
contributors = []
if display_author:
contributors.append(
ContributorData(display_name=display_author)
)
metadata = Metadata(
data_source=DataSource.NYT,
title=title,
medium=medium,
language='eng',
published=published_date,
publisher=publisher,
contributors=contributors,
primary_identifier=primary_isbn,
identifiers=other_isbns,
)
super(NYTBestSellerListTitle, self).__init__(
metadata, first_appearance, most_recent_appearance,
annotation
)