Source code for api.nyt

"""Interface to the New York Times APIs."""
from collections import Counter
from datetime import datetime, timedelta
import dateutil
import isbnlib
import os
import json
import logging
from sqlalchemy.orm.session import Session
from sqlalchemy.orm.exc import (
    NoResultFound,
)
from flask_babel import lazy_gettext as _

from .config import (
    CannotLoadConfiguration,
    IntegrationException,
)

from core.selftest import (
    HasSelfTests,
)
from core.opds_import import MetadataWranglerOPDSLookup
from core.metadata_layer import (
    Metadata,
    IdentifierData,
    ContributorData,
)
from core.model import (
    get_one_or_create,
    CustomList,
    DataSource,
    Edition,
    ExternalIntegration,
    Identifier,
    Representation,
)
from core.external_list import TitleFromExternalList

class NYTAPI(object):
    """Date parsing and formatting helpers shared by NYT API clients."""

    # strptime/strftime pattern used for every date handled by this class.
    DATE_FORMAT = "%Y-%m-%d"

    # NYT best-seller lists are associated with dates, but fields like
    # CustomEntry.first_appearance are timezone-aware datetimes. We
    # will interpret a date as meaning midnight of that day in New
    # York.
    #
    # NOTE: entries fetched before we made the datetimes
    # timezone-aware will have their time zones set to UTC, but the
    # difference is negligible.
    #
    # The zone key is spelled with an underscore, as in the IANA
    # database. A space-separated spelling is not a real key and only
    # resolves when the lookup layer happens to normalize it; when it
    # does not, the lookup yields no zone and parse_datetime() would
    # quietly produce naive datetimes.
    TIME_ZONE = dateutil.tz.gettz("America/New_York")

    @classmethod
    def parse_datetime(cls, d):
        """Used to parse the publication date of a NYT best-seller list.

        We take midnight Eastern time to be the publication time.

        :param d: A date string in DATE_FORMAT.
        :return: A datetime at midnight of that date with its tzinfo
            set to TIME_ZONE.
        :raise ValueError: (from strptime) if `d` does not match
            DATE_FORMAT.
        """
        midnight = datetime.strptime(d, cls.DATE_FORMAT)
        return midnight.replace(tzinfo=cls.TIME_ZONE)

    @classmethod
    def parse_date(cls, d):
        """Used to parse the publication date of a book.

        We don't know the timezone here, so the date will end up being
        stored as midnight UTC.

        :param d: A date string in DATE_FORMAT.
        :return: The date portion of parse_datetime(d).
        """
        return cls.parse_datetime(d).date()

    @classmethod
    def date_string(cls, d):
        """Format a date or datetime as a DATE_FORMAT string."""
        return d.strftime(cls.DATE_FORMAT)


class NYTBestSellerAPI(NYTAPI, HasSelfTests):
    """Client for the best-seller list endpoints under BASE_URL.

    Responses are fetched through Representation.get(), which is
    given a maximum age for each request.
    """

    PROTOCOL = ExternalIntegration.NYT
    GOAL = ExternalIntegration.METADATA_GOAL
    NAME = _("NYT Best Seller API")
    CARDINALITY = 1

    SETTINGS = [
        { "key": ExternalIntegration.PASSWORD, "label": _("API key"), "required": True },
    ]

    # An NYT integration is shared by all libraries in a circulation manager.
    SITEWIDE = True

    BASE_URL = "http://api.nytimes.com/svc/books/v3/lists"

    LIST_NAMES_URL = BASE_URL + "/names.json"
    LIST_URL = BASE_URL + ".json?list=%s"

    # Maximum ages passed to Representation.get() for each kind of
    # request.
    LIST_OF_LISTS_MAX_AGE = timedelta(days=1)
    LIST_MAX_AGE = timedelta(days=1)
    HISTORICAL_LIST_MAX_AGE = timedelta(days=365)

    @classmethod
    def from_config(cls, _db, **kwargs):
        """Build an instance from the site's NYT ExternalIntegration.

        :param _db: A database session.
        :param kwargs: Extra keyword arguments for the constructor.
        :raise CannotLoadConfiguration: If no NYT integration exists
            (or, via the constructor, if it has no password set).
        """
        integration = cls.external_integration(_db)

        if not integration:
            message = "No ExternalIntegration found for the NYT."
            raise CannotLoadConfiguration(message)

        return cls(_db, api_key=integration.password, **kwargs)

    def __init__(self, _db, api_key=None, do_get=None, metadata_client=None):
        """Constructor.

        :param _db: A database session.
        :param api_key: The NYT API key; required.
        :param do_get: Callable used to make HTTP requests; defaults
            to Representation.simple_http_get.
        :param metadata_client: Client handed to the best-seller lists
            this object creates. If omitted, one is loaded from the
            metadata wrangler configuration when available; otherwise
            it stays None.
        :raise CannotLoadConfiguration: If `api_key` is missing.
        """
        self.log = logging.getLogger("NYT API")
        self._db = _db
        if not api_key:
            raise CannotLoadConfiguration("No NYT API key is specified")
        self.api_key = api_key
        self.do_get = do_get or Representation.simple_http_get
        if not metadata_client:
            try:
                metadata_client = MetadataWranglerOPDSLookup.from_config(
                    self._db
                )
            except CannotLoadConfiguration:
                # Best effort: a missing metadata wrangler is logged,
                # not fatal, and metadata_client remains unset.
                self.log.error(
                    "Metadata wrangler integration is not configured, proceeding without one."
                )
        self.metadata_client = metadata_client

    @classmethod
    def external_integration(cls, _db):
        """Look up the NYT metadata ExternalIntegration."""
        return ExternalIntegration.lookup(
            _db, ExternalIntegration.NYT,
            ExternalIntegration.METADATA_GOAL
        )

    def _run_self_tests(self, _db):
        """Yield the result of fetching the list of best-seller lists."""
        yield self.run_test(
            "Getting list of best-seller lists", self.list_of_lists
        )

    @property
    def source(self):
        """The DataSource for the NYT, looked up in this object's session."""
        # The session lives on the instance; there is no bare `_db`
        # name in scope inside this property.
        return DataSource.lookup(self._db, DataSource.NYT)

    def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
        """Make an API request and return the decoded JSON document.

        :param path: Either a full URL starting with BASE_URL or a
            path relative to it (a leading slash is optional).
        :param identifier: Unused by this method.
        :param max_age: Maximum age passed to Representation.get().
        :return: The parsed JSON body of a 200 response.
        :raise IntegrationException: On any status code other than
            200; a 403 is reported as an authentication failure.
        """
        if path.startswith(self.BASE_URL):
            url = path
        else:
            if not path.startswith("/"):
                path = "/" + path
            url = self.BASE_URL + path

        # Append the API key as the first or as an additional query
        # parameter, depending on what the URL already contains.
        joiner = '&' if '?' in url else '?'
        url += joiner + "api-key=" + self.api_key

        representation, cached = Representation.get(
            self._db, url, do_get=self.do_get, max_age=max_age, debug=True,
            pause_before=0.1)
        status = representation.status_code
        if status == 200:
            # Everything's fine.
            return json.loads(representation.content)

        diagnostic = "Response from %s was: %r" % (
            url, representation.content.decode("utf-8") if representation.content else ""
        )

        if status == 403:
            raise IntegrationException(
                "API authentication failed",
                "API key is most likely wrong. %s" % diagnostic
            )
        raise IntegrationException(
            "Unknown API error (status %s)" % status, diagnostic
        )

    def list_of_lists(self, max_age=LIST_OF_LISTS_MAX_AGE):
        """Fetch the document describing all available lists."""
        return self.request(self.LIST_NAMES_URL, max_age=max_age)

    def list_info(self, list_name):
        """Find the description of one list in the list of lists.

        :param list_name: The list's 'list_name_encoded' value.
        :return: The first matching entry of the 'results' array.
        :raise ValueError: If no list has that encoded name.
        """
        matches = [
            info for info in self.list_of_lists()['results']
            if info['list_name_encoded'] == list_name
        ]
        if not matches:
            raise ValueError("No such list: %s" % list_name)
        return matches[0]

    def best_seller_list(self, list_info, date=None):
        """Create (but don't update) a NYTBestSellerList object.

        :param list_info: Either a list description dictionary or the
            encoded name of a list, which is resolved via list_info().
        :param date: Unused by this method.
        """
        if isinstance(list_info, str):
            list_info = self.list_info(list_info)
        return NYTBestSellerList(list_info, self.metadata_client)

    def update(self, list, date=None, max_age=LIST_MAX_AGE):
        """Update the given list with data from the given date.

        :param list: A NYTBestSellerList.
        :param date: If given, request the edition published on this
            date rather than letting the API choose.
        :param max_age: Maximum age passed through to request().
        """
        name = list.foreign_identifier
        url = self.LIST_URL % name
        if date:
            url += "&published-date=%s" % self.date_string(date)

        data = self.request(url, max_age=max_age)
        list.update(data)

    def fill_in_history(self, list):
        """Update the given list with current and historical data.

        The session is committed after each date is processed.
        """
        for date in list.all_dates:
            self.update(list, date, self.HISTORICAL_LIST_MAX_AGE)
            self._db.commit()


class NYTBestSellerList(list):
    """A list of NYTBestSellerListTitle objects for one best-seller list.

    Titles are also indexed by primary ISBN in `items_by_isbn`, so a
    book seen in several editions of the list is only added once.
    """

    # Approximate number of days between editions of a list, keyed by
    # the value of the 'updated' field in the list description.
    FREQUENCY_DAYS = {
        'WEEKLY': 7,
        'MONTHLY': 30,
    }

    def __init__(self, list_info, metadata_client):
        """Constructor.

        :param list_info: A list description dictionary with the keys
            'display_name', 'oldest_published_date',
            'newest_published_date', 'list_name_encoded' and 'updated'.
        :param metadata_client: Passed along to each title's
            to_custom_list_entry() in update_custom_list().
        :raise ValueError: If the 'updated' value is not a key of
            FREQUENCY_DAYS.
        """
        self.name = list_info['display_name']
        self.created = NYTAPI.parse_datetime(list_info['oldest_published_date'])
        self.updated = NYTAPI.parse_datetime(list_info['newest_published_date'])
        self.foreign_identifier = list_info['list_name_encoded']
        self.frequency = self._frequency_for(list_info['updated'])
        self.items_by_isbn = dict()
        self.metadata_client = metadata_client
        self.log = logging.getLogger("NYT Best-seller list %s" % self.name)

    @classmethod
    def _frequency_for(cls, updated):
        """Translate an 'updated' value into a timedelta between editions.

        :raise ValueError: If `updated` is not a known frequency. Failing
            here is clearer than leaving the frequency undefined.
        """
        try:
            days = cls.FREQUENCY_DAYS[updated]
        except KeyError:
            raise ValueError(
                "Unknown best-seller list update frequency: %r" % (updated,)
            )
        return timedelta(days)

    @property
    def medium(self):
        """What medium are the books on this list?

        Lists like "Audio Fiction" contain audiobooks; all others
        contain normal books. (TODO: this isn't quite right; the
        distinction between ebooks and print books here exists in a
        way it doesn't with most other sources of Editions.)

        :return: None if the list has no name; otherwise an Edition
            medium constant.
        """
        name = self.name
        if not name:
            return None
        if name.startswith("Audio "):
            return Edition.AUDIO_MEDIUM
        return Edition.BOOK_MEDIUM

    @property
    def all_dates(self):
        """Yield a list of estimated dates when new editions of this list were
        probably published.

        Dates run backwards from `updated` in steps of `frequency`.
        If the last step would jump past `created`, `created` itself
        is yielded as the final date.
        """
        date = self.updated
        end = self.created
        while date >= end:
            yield date
            old_date = date
            date = date - self.frequency
            if old_date > end and date < end:
                # We overshot the end date.
                yield end

    def update(self, json_data):
        """Update the list with information from the given JSON structure.

        Each entry of json_data['results'] either adds a new title or
        widens the first/most recent appearance dates of a title seen
        before. Entries with no book details, and entries whose title
        cannot be built (ValueError), are logged and skipped.
        """
        for li_data in json_data.get('results', []):
            details = li_data.get('book_details')
            if not details:
                # Without book details there is no ISBN to key on.
                # Skip the entry instead of letting an IndexError or
                # KeyError abort the whole update.
                self.log.error("No book details for %r", li_data)
                continue

            book = details[0]
            key = book.get('primary_isbn13') or book.get('primary_isbn10')
            if key in self.items_by_isbn:
                item = self.items_by_isbn[key]
                self.log.debug("Previously seen ISBN: %r", key)
            else:
                try:
                    item = NYTBestSellerListTitle(li_data, self.medium)
                except ValueError:
                    # Should only happen when the book has no identifier,
                    # which... should never happen.
                    self.log.error("No identifier for %r", li_data)
                    continue
                self.items_by_isbn[key] = item
                self.append(item)

            # This is the date the *best-seller list* was published,
            # not the date the book was published.
            list_date = NYTAPI.parse_datetime(li_data['published_date'])
            if not item.first_appearance or list_date < item.first_appearance:
                item.first_appearance = list_date
            if (not item.most_recent_appearance
                or list_date > item.most_recent_appearance):
                item.most_recent_appearance = list_date

    def to_customlist(self, _db):
        """Turn this NYTBestSeller list into a CustomList object.

        The CustomList is found or created by NYT data source and
        foreign identifier, then its name, update time and entries
        are brought in line with this list.

        :param _db: A database session.
        :return: The CustomList.
        """
        data_source = DataSource.lookup(_db, DataSource.NYT)
        custom_list, was_new = get_one_or_create(
            _db,
            CustomList,
            data_source=data_source,
            foreign_identifier=self.foreign_identifier,
            create_method_kwargs = dict(
                created=self.created,
            )
        )
        custom_list.name = self.name
        custom_list.updated = self.updated
        self.update_custom_list(custom_list)
        return custom_list

    def update_custom_list(self, custom_list):
        """Make sure the given CustomList's CustomListEntries reflect
        the current state of the NYTBestSeller list.
        """
        # Add new items to the list.
        for title in self:
            list_item, was_new = title.to_custom_list_entry(
                custom_list, self.metadata_client)
            # If possible, associate the item with a Work.
            list_item.set_work()


class NYTBestSellerListTitle(TitleFromExternalList):
    """A single book entry taken from an NYT best-seller list."""

    def __init__(self, data, medium):
        """Build the Metadata for one best-seller list entry.

        :param data: A dictionary for one list entry. It must have a
            'book_details' list; 'bestsellers_date' and
            'published_date' are optional date strings.
        :param medium: The medium to record on the Metadata.
        """
        # A missing date arrives as None, which strptime rejects with
        # TypeError; a malformed one raises ValueError. Either way the
        # date is treated as unknown.
        try:
            bestsellers_date = NYTAPI.parse_datetime(
                data.get('bestsellers_date')
            )
        except (ValueError, TypeError):
            bestsellers_date = None
        first_appearance = bestsellers_date
        most_recent_appearance = bestsellers_date

        try:
            # This is the date the _book_ was published, not the date
            # the _bestseller list_ was published.
            published_date = NYTAPI.parse_date(data.get('published_date'))
        except (ValueError, TypeError):
            published_date = None

        details = data['book_details']
        other_isbns = []
        if not details:
            publisher = annotation = primary_isbn10 = primary_isbn13 = title = None
            display_author = None
        else:
            # Only the first set of book details is used.
            d = details[0]
            title = d.get('title', None)
            display_author = d.get('author', None)
            publisher = d.get('publisher', None)
            annotation = d.get('description', None)
            primary_isbn10 = d.get('primary_isbn10', None)
            primary_isbn13 = d.get('primary_isbn13', None)

            # The list of other ISBNs frequently contains ISBNs for
            # other books in the same series, as well as ISBNs that
            # are just wrong. Assign these equivalencies at a low
            # level of confidence.
            for isbn in d.get('isbns', []):
                isbn13 = isbn.get('isbn13', None)
                if isbn13:
                    other_isbns.append(
                        IdentifierData(Identifier.ISBN, isbn13, 0.50)
                    )

        # Prefer the 13-digit ISBN as the primary identifier.
        primary_isbn = primary_isbn13 or primary_isbn10
        if primary_isbn:
            primary_isbn = IdentifierData(Identifier.ISBN, primary_isbn, 0.90)

        contributors = []
        if display_author:
            contributors.append(
                ContributorData(display_name=display_author)
            )

        metadata = Metadata(
            data_source=DataSource.NYT,
            title=title,
            medium=medium,
            language='eng',
            published=published_date,
            publisher=publisher,
            contributors=contributors,
            primary_identifier=primary_isbn,
            identifiers=other_isbns,
        )

        super(NYTBestSellerListTitle, self).__init__(
            metadata, first_appearance, most_recent_appearance,
            annotation
        )
Source code for api.nyt

Library Simplified Circulation Manager

Navigation

Related Topics