Source code for api.novelist

import json
import logging
import urllib.request
import urllib.parse
import urllib.error
from collections import Counter
from flask_babel import lazy_gettext as _

from core.config import (
    CannotLoadConfiguration,
    Configuration,
)
from core.coverage import (
    CoverageFailure,
    IdentifierCoverageProvider,
)
from core.metadata_layer import (
    ContributorData,
    IdentifierData,
    LinkData,
    MeasurementData,
    Metadata,
    SubjectData,
)
from core.model import (
    DataSource,
    ExternalIntegration,
    Hyperlink,
    Identifier,
    Measurement,
    Representation,
    Session,
    Subject,
    get_one,
    Equivalency,
    LicensePool,
    Collection,
    Edition,
    Contributor,
    Contribution,
)
from core.util import TitleProcessor
from sqlalchemy.sql import (
    select,
    join,
    and_,
    or_,
)
from sqlalchemy.orm import aliased
from core.util.http import HTTP


class NoveListAPI(object):
    """Client for the NoveList lookup API and the NoveList collection-data API.

    An instance is bound to a database session and one set of NoveList
    credentials (``profile`` / ``password``). It can:

    * look up NoveList metadata for an identifier (``lookup``), and
    * send the ISBNs held by a library's collections to NoveList
      (``put_items_novelist``).
    """

    PROTOCOL = ExternalIntegration.NOVELIST
    NAME = _("Novelist API")

    # Hardcoded authentication key used as a Header for calling the NoveList
    # Collections API. It identifies the client, and lets NoveList know that
    # SimplyE is making the requests.
    AUTHORIZED_IDENTIFIER = "62521fa1-bdbb-4939-84aa-aee2a52c8d59"

    SETTINGS = [
        {"key": ExternalIntegration.USERNAME, "label": _("Profile"), "required": True},
        {"key": ExternalIntegration.PASSWORD, "label": _("Password"), "required": True},
    ]

    # Different libraries may have different NoveList integrations
    # on the same circulation manager.
    SITEWIDE = False

    # Class-level cache for is_configured(): the last answer, and the id of
    # the library that answer was computed for.
    IS_CONFIGURED = None
    _configuration_library_id = None

    log = logging.getLogger("NoveList API")
    version = "2.2"

    NO_ISBN_EQUIVALENCY = "No clear ISBN equivalency: %r"

    # While the NoveList API doesn't require parameters to be passed via URL,
    # the Representation object needs a unique URL to return the proper data
    # from the database.
    QUERY_ENDPOINT = (
        "https://novselect.ebscohost.com/Data/ContentByQuery?"
        "ISBN=%(ISBN)s&ClientIdentifier=%(ClientIdentifier)s&version=%(version)s"
    )
    COLLECTION_DATA_API = "http://www.noveListcollectiondata.com/api/collections"
    AUTH_PARAMS = "&profile=%(profile)s&password=%(password)s"
    MAX_REPRESENTATION_AGE = 7*24*60*60      # one week

    currentQueryIdentifier = None

    # Maps an Edition medium to the "mediaType" value sent to NoveList.
    medium_to_book_format_type_values = {
        Edition.BOOK_MEDIUM: "EBook",
        Edition.AUDIO_MEDIUM: "Audiobook",
    }

    @classmethod
    def from_config(cls, library):
        """Build an API object from the library's NoveList integration.

        :param library: The library whose integration should be used.
        :raise CannotLoadConfiguration: If the library has no profile or
            no password configured.
        :return: A NoveListAPI instance.
        """
        profile, password = cls.values(library)
        if not (profile and password):
            raise CannotLoadConfiguration(
                "No NoveList integration configured for library (%s)." %
                library.short_name
            )

        _db = Session.object_session(library)
        return cls(_db, profile, password)

    @classmethod
    def values(cls, library):
        """Find the NoveList credentials configured for a library.

        :return: A 2-tuple (profile, password); (None, None) if the library
            has no NoveList integration.
        """
        _db = Session.object_session(library)

        integration = ExternalIntegration.lookup(
            _db, ExternalIntegration.NOVELIST,
            ExternalIntegration.METADATA_GOAL, library=library
        )

        if not integration:
            return (None, None)

        profile = integration.username
        password = integration.password
        return (profile, password)

    @classmethod
    def is_configured(cls, library):
        """Report whether the library has both a profile and a password.

        The answer is cached on the class and recomputed only when a
        different library is asked about.
        """
        if (cls.IS_CONFIGURED is None or
                library.id != cls._configuration_library_id):
            profile, password = cls.values(library)
            cls.IS_CONFIGURED = bool(profile and password)
            cls._configuration_library_id = library.id
        return cls.IS_CONFIGURED

    def __init__(self, _db, profile, password):
        """Store the database session and the NoveList credentials."""
        self._db = _db
        self.profile = profile
        self.password = password

    @property
    def source(self):
        """The NoveList DataSource."""
        return DataSource.lookup(self._db, DataSource.NOVELIST)

    def lookup_equivalent_isbns(self, identifier):
        """Finds NoveList data for all ISBNs equivalent to an identifier.

        :return: Metadata object or None
        """
        license_sources = DataSource.license_sources_for(self._db, identifier)

        # Find strong ISBN equivalents.
        isbns = list()
        for license_source in license_sources:
            isbns += [eq.output for eq in identifier.equivalencies if (
                eq.data_source == license_source and
                eq.strength == 1 and
                eq.output.type == Identifier.ISBN
            )]

        if not isbns:
            self.log.warning(
                ("Identifiers without an ISBN equivalent can't "
                 "be looked up with NoveList: %r"), identifier
            )
            return None

        # Look up metadata for all equivalent ISBNs.
        lookup_metadata = list()
        for isbn in isbns:
            metadata = self.lookup(isbn)
            if metadata:
                lookup_metadata.append(metadata)

        if not lookup_metadata:
            self.log.warning(
                "No NoveList metadata found for any ISBN equivalent of %r",
                identifier
            )
            return None

        best_metadata, confidence = self.choose_best_metadata(
            lookup_metadata, identifier
        )
        if best_metadata:
            if round(confidence, 2) < 0.5:
                self.log.warning(self.NO_ISBN_EQUIVALENCY, identifier)
                return None
        # Return the chosen candidate (None when no candidate won), not the
        # leftover loop variable from the ISBN lookups above.
        return best_metadata

    @classmethod
    def _confirm_same_identifier(cls, metadata_objects):
        """Ensures that all metadata objects have the same NoveList ID"""
        novelist_ids = {
            metadata.primary_identifier.identifier
            for metadata in metadata_objects
        }
        return len(novelist_ids) == 1

    def choose_best_metadata(self, metadata_objects, identifier):
        """Chooses the most likely book metadata from a list of Metadata objects

        Given several Metadata objects with different NoveList IDs, this
        method returns the metadata of the ID with the highest representation
        and a float representing confidence in the result.

        :return: A 2-tuple (Metadata, confidence), or (None, None) when there
            are no candidates or the two most common IDs are tied.
        """
        if not metadata_objects:
            # Nothing to choose from.
            return None, None

        confidence = 1.0
        if self._confirm_same_identifier(metadata_objects):
            # Metadata with the same NoveList ID will be identical. Take one.
            return metadata_objects[0], confidence

        # One or more of the equivalents did not return the same NoveList work
        self.log.warning("%r has inaccurate ISBN equivalents", identifier)

        # Count by the identifier string, the same value
        # _confirm_same_identifier compares, so that two objects describing
        # the same NoveList ID are always counted together.
        counter = Counter(
            metadata.primary_identifier.identifier
            for metadata in metadata_objects
        )

        # At least two distinct IDs exist here, so most_common(2) has
        # exactly two entries.
        [(target_id, most_amount), (ignore, secondmost)] = counter.most_common(2)
        if most_amount == secondmost:
            # The counts are the same, and neither can be trusted.
            self.log.warning(self.NO_ISBN_EQUIVALENCY, identifier)
            return None, None

        confidence = most_amount / float(len(metadata_objects))
        target_metadata = [
            m for m in metadata_objects
            if m.primary_identifier.identifier == target_id
        ]
        return target_metadata[0], confidence

    def lookup(self, identifier, **kwargs):
        """Requests NoveList metadata for a particular identifier

        :param kwargs: Keyword arguments passed into Representation.post().

        :return: Metadata object or None
        """
        if identifier.type != Identifier.ISBN:
            return self.lookup_equivalent_isbns(identifier)

        params = dict(
            ClientIdentifier=identifier.urn,
            ISBN=identifier.identifier,
            version=self.version,
            profile=self.profile,
            password=self.password
        )
        scrubbed_url = str(self.scrubbed_url(params))

        url = self.build_query_url(params)
        # Log the scrubbed URL only: the full URL carries the profile and
        # password in its query string.
        self.log.debug("NoveList lookup: %s", scrubbed_url)

        # We want to make an HTTP request for `url` but cache the
        # result under `scrubbed_url`. Define a 'URL normalization'
        # function that always returns `scrubbed_url`.
        def normalized_url(original):
            return scrubbed_url

        representation, from_cache = Representation.post(
            _db=self._db, url=str(url), data='',
            max_age=self.MAX_REPRESENTATION_AGE,
            response_reviewer=self.review_response,
            url_normalizer=normalized_url, **kwargs
        )

        # Commit to the database immediately to reduce the chance
        # that some other incoming request will try to create a
        # duplicate Representation and crash.
        self._db.commit()

        return self.lookup_info_to_metadata(representation)

    @classmethod
    def review_response(cls, response):
        """Performs NoveList-specific error review of the request response

        :param response: A 3-tuple (status code, headers, content).
        :raise Exception: On a 403 status, or when the content starts with
            NoveList's '"Missing' error prefix.
        :return: The response, unchanged.
        """
        status_code, headers, content = response
        if status_code == 403:
            raise Exception("Invalid NoveList credentials")
        # An empty body cannot carry the error prefix; skip the check
        # rather than calling startswith on None.
        if content and content.startswith(b'"Missing'):
            raise Exception("Invalid NoveList parameters: %s" % content.decode("utf-8"))
        return response

    @classmethod
    def scrubbed_url(cls, params):
        """Removes authentication details from cached Representation.url"""
        return cls.build_query_url(params, include_auth=False)

    @classmethod
    def _scrub_subtitle(cls, subtitle):
        """Removes common NoveList subtitle annoyances"""
        if subtitle:
            subtitle = subtitle.replace('[electronic resource]', '')
            # Then get rid of any leading whitespace or punctuation.
            subtitle = TitleProcessor.extract_subtitle('', subtitle)
        return subtitle

    @classmethod
    def build_query_url(cls, params, include_auth=True):
        """Builds a unique and url-encoded query endpoint

        :param params: Dictionary of string values for the URL template.
        :param include_auth: If true, append the profile/password parameters.
        """
        url = cls.QUERY_ENDPOINT
        if include_auth:
            url += cls.AUTH_PARAMS

        urlencoded_params = {
            name: urllib.parse.quote(value)
            for name, value in params.items()
        }
        return url % urlencoded_params

    def lookup_info_to_metadata(self, lookup_representation):
        """Transforms a NoveList JSON representation into a Metadata object

        :return: Metadata object, or None if the representation is empty,
            NoveList did not recognize the ISBN, or nothing of interest
            was found.
        """
        if not lookup_representation.content:
            return None

        lookup_info = json.loads(lookup_representation.content)
        book_info = lookup_info.get('TitleInfo')
        novelist_identifier = book_info.get('ui') if book_info else None
        if not novelist_identifier:
            # NoveList didn't know the ISBN.
            return None

        primary_identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.NOVELIST_ID, novelist_identifier
        )
        metadata = Metadata(self.source, primary_identifier=primary_identifier)

        # Get the equivalent ISBN identifiers.
        metadata.identifiers += self._extract_isbns(book_info)

        author = book_info.get('author')
        if author:
            metadata.contributors.append(ContributorData(sort_name=author))

        description = book_info.get('description')
        if description:
            metadata.links.append(LinkData(
                rel=Hyperlink.DESCRIPTION, content=description,
                media_type=Representation.TEXT_PLAIN
            ))

        audience_level = book_info.get('audience_level')
        if audience_level:
            metadata.subjects.append(SubjectData(
                Subject.FREEFORM_AUDIENCE, audience_level
            ))

        novelist_rating = book_info.get('rating')
        if novelist_rating:
            metadata.measurements.append(MeasurementData(
                Measurement.RATING, novelist_rating
            ))

        # Extract feature content if it is available.
        series_info = None
        appeals_info = None
        lexile_info = None
        goodreads_info = None
        recommendations_info = None
        feature_content = lookup_info.get('FeatureContent')
        if feature_content:
            series_info = feature_content.get('SeriesInfo')
            appeals_info = feature_content.get('Appeals')
            lexile_info = feature_content.get('LexileInfo')
            goodreads_info = feature_content.get('GoodReads')
            recommendations_info = feature_content.get('SimilarTitles')

        metadata, title_key = self.get_series_information(
            metadata, series_info, book_info
        )
        metadata.title = book_info.get(title_key)
        subtitle = TitleProcessor.extract_subtitle(
            metadata.title, book_info.get('full_title')
        )
        metadata.subtitle = self._scrub_subtitle(subtitle)

        # TODO: How well do we trust this data? We could conceivably bump up
        # the weight here.
        if appeals_info:
            # Only the first appeal that lists any genres is used.
            for appeal in appeals_info:
                genres = appeal.get('genres')
                if genres:
                    for genre in genres:
                        metadata.subjects.append(SubjectData(
                            Subject.TAG, genre['Name']
                        ))
                    break

        if lexile_info:
            metadata.subjects.append(SubjectData(
                Subject.LEXILE_SCORE, lexile_info['Lexile']
            ))

        if goodreads_info:
            metadata.measurements.append(MeasurementData(
                Measurement.RATING, goodreads_info['average_rating']
            ))

        metadata = self.get_recommendations(metadata, recommendations_info)

        # If nothing interesting comes from the API, ignore it.
        if not (metadata.measurements or metadata.series_position or
                metadata.series or metadata.subjects or metadata.links or
                metadata.subtitle or metadata.recommendations):
            metadata = None
        return metadata

    def get_series_information(self, metadata, series_info, book_info):
        """Returns metadata object with series info and optimal title key

        :param metadata: Metadata object to update in place.
        :param series_info: The 'SeriesInfo' dictionary, or None.
        :param book_info: The 'TitleInfo' dictionary.
        :raise ValueError: If more than one series volume matches the book.
        :return: A 2-tuple (metadata, title_key), where title_key is the
            `book_info` key that should be used as the title.
        """
        title_key = 'main_title'
        if series_info:
            metadata.series = series_info['full_title']
            series_titles = series_info.get('series_titles')
            if series_titles:
                matching_series_volume = [
                    volume for volume in series_titles
                    if volume.get('full_title') == book_info.get('full_title')
                ]
                if not matching_series_volume:
                    # If there's no full_title match, try the main_title.
                    matching_series_volume = [
                        volume for volume in series_titles
                        if volume.get('main_title') == book_info.get('main_title')
                    ]
                if len(matching_series_volume) > 1:
                    # This probably won't happen, but if it does, it will be
                    # difficult to debug without an error.
                    raise ValueError("Multiple matching volumes found.")

                if matching_series_volume:
                    series_position = matching_series_volume[0].get('volume')
                else:
                    series_position = None

                if series_position:
                    # Drop a single trailing period before converting.
                    if series_position.endswith('.'):
                        series_position = series_position[:-1]
                    metadata.series_position = int(series_position)

                # Sometimes all of the volumes in a series have the same
                # main_title so using the full_title is preferred.
                main_titles = [volume.get(title_key) for volume in series_titles]
                if len(main_titles) > 1 and len(set(main_titles)) == 1:
                    title_key = 'full_title'

        return metadata, title_key

    def _extract_isbns(self, book_info):
        """Collect an IdentifierData for every ISBN among the manifestations.

        :param book_info: A dictionary that may have a 'manifestations' list.
        :return: A list of IdentifierData; empty if there are no
            manifestations.
        """
        isbns = []

        # A missing 'manifestations' entry means there are no ISBNs.
        synonymous_ids = book_info.get('manifestations') or []
        for synonymous_id in synonymous_ids:
            isbn = synonymous_id.get('ISBN')
            if isbn:
                isbn_data = IdentifierData(Identifier.ISBN, isbn)
                isbns.append(isbn_data)

        return isbns

    def get_recommendations(self, metadata, recommendations_info):
        """Add the ISBNs of locally held similar titles to the metadata.

        :param metadata: Metadata object to update in place.
        :param recommendations_info: The 'SimilarTitles' dictionary, or None.
        :return: The same Metadata object.
        """
        if not recommendations_info:
            return metadata

        # A missing 'titles' entry means there is nothing to recommend.
        related_books = recommendations_info.get('titles') or []
        related_books = [b for b in related_books if b.get('is_held_locally')]
        for book_info in related_books:
            metadata.recommendations += self._extract_isbns(book_info)
        return metadata

    def get_items_from_query(self, library):
        """Gets identifiers and its related title, medium, and authors from the
        database.
        Keeps track of the current 'ISBN' identifier and current item object that
        is being processed. If the next ISBN being processed is new, the existing one
        gets added to the list of items. If the ISBN is the same, then we append
        the Author property since there are multiple contributors.

        :return: a list of Novelist objects to send
        """
        collectionList = [c.id for c in library.collections]

        LEFT_OUTER_JOIN = True
        i1 = aliased(Identifier)
        i2 = aliased(Identifier)
        roles = list(Contributor.AUTHOR_ROLES)
        roles.append(Contributor.NARRATOR_ROLE)

        isbnQuery = select(
            [i1.identifier, i1.type, i2.identifier,
             Edition.title, Edition.medium, Edition.published,
             Contribution.role, Contributor.sort_name,
             DataSource.name],
        ).select_from(
            join(LicensePool, i1, i1.id == LicensePool.identifier_id)
            .join(Equivalency, i1.id == Equivalency.input_id, LEFT_OUTER_JOIN)
            .join(i2, Equivalency.output_id == i2.id, LEFT_OUTER_JOIN)
            .join(
                Edition,
                or_(Edition.primary_identifier_id == i1.id, Edition.primary_identifier_id == i2.id)
            )
            .join(Contribution, Edition.id == Contribution.edition_id)
            .join(Contributor, Contribution.contributor_id == Contributor.id)
            .join(DataSource, DataSource.id == LicensePool.data_source_id)
        ).where(
            and_(
                LicensePool.collection_id.in_(collectionList),
                or_(i1.type == "ISBN", i2.type == "ISBN"),
                or_(Contribution.role.in_(roles))
            )
        ).order_by(i1.identifier, i2.identifier)

        result = self._db.execute(isbnQuery)

        items = []
        newItem = None
        existingItem = None
        currentIdentifier = None

        # Loop through the query result. There's a need to keep track of the
        # previously processed object and the currently processed object because
        # the identifier could be the same. If it is, we update the data
        # object to send to Novelist.
        for item in result:
            if newItem:
                existingItem = newItem
            (currentIdentifier, existingItem, newItem, addItem) = (
                self.create_item_object(item, currentIdentifier, existingItem)
            )

            if addItem and existingItem:
                # The Role property isn't needed in the actual request.
                del existingItem['role']
                items.append(existingItem)

        # Flush the item that is still being accumulated. That is `newItem`
        # when the last row started a new ISBN, or `existingItem` when the
        # last row was merged into the previous one (in which case `newItem`
        # is None and the item has not been added yet).
        pendingItem = newItem or existingItem
        if pendingItem:
            pendingItem.pop('role', None)
            items.append(pendingItem)

        return items

    def create_item_object(self, object, currentIdentifier, existingItem):
        """Returns a new item if the current identifier that was processed
        is not the same as the new object's ISBN being processed. If the new
        object's ISBN matches the current identifier, the previous object's
        Author property is updated.

        :param object: the current item object to process
        :param currentIdentifier: the current identifier to process
        :param existingItem: the previously processed item object

        :return: (
            current identifier,
            the existing object if available,
            a new object if the item wasn't found before,
            if the item is ready to the added to the list of books to send
        )
        """
        if not object:
            return (None, None, None, False)

        # Row layout, as selected by get_items_from_query:
        # 0 identifier, 1 identifier type, 2 equivalent identifier, 3 title,
        # 4 medium, 5 published, 6 role, 7 contributor sort name,
        # 8 data source name.
        if object[1] == Identifier.ISBN:
            isbn = object[0]
        elif object[2] is not None:
            isbn = object[2]
        else:
            # We cannot find an ISBN for this work -- probably due to
            # a data error.
            return (None, None, None, False)

        roles = list(Contributor.AUTHOR_ROLES)
        roles.append(Contributor.NARRATOR_ROLE)

        role = object[6]
        author_or_narrator = object[7] if role in roles else ""
        distributor = object[8]

        # If there's no existing author value but we now get one, add it.
        # If the role is narrator and it's a new value
        # (i.e. no "narrator" was already added), then add the narrator.
        # If we encounter an existing ISBN and its role is "Primary Author",
        # then that value overrides the existing Author property.
        if isbn == currentIdentifier and existingItem:
            if not existingItem.get('author') and role in Contributor.AUTHOR_ROLES:
                existingItem['author'] = author_or_narrator
            if not existingItem.get('narrator') and role == Contributor.NARRATOR_ROLE:
                existingItem['narrator'] = author_or_narrator
            if role == Contributor.PRIMARY_AUTHOR_ROLE:
                existingItem['author'] = author_or_narrator
            existingItem['role'] = role

            # Always return False to keep processing the currentIdentifier until
            # we get a new ISBN to process. In that case, return and add all
            # the data we've accumulated for this object.
            return (currentIdentifier, existingItem, None, False)

        # If we encounter a new ISBN, we take whatever values are initially given.
        title = object[3]
        mediaType = self.medium_to_book_format_type_values.get(object[4], "")

        newItem = dict(
            isbn=isbn,
            title=title,
            mediaType=mediaType,
            role=role,
            distributor=distributor
        )

        publicationDate = object[5]
        if publicationDate:
            publicationDateString = publicationDate.isoformat().replace("-", "")
            newItem["publicationDate"] = publicationDateString

        # If we are processing a new item and there is an existing item,
        # then we can add the existing item to the list and keep
        # the current new item for further data aggregation.
        addItem = bool(existingItem)
        if role in Contributor.AUTHOR_ROLES:
            newItem['author'] = author_or_narrator
        if role == Contributor.NARRATOR_ROLE:
            newItem['narrator'] = author_or_narrator

        return (isbn, existingItem, newItem, addItem)

    def put_items_novelist(self, library):
        """Send the library's ISBN items to the NoveList collection-data API.

        :return: The decoded JSON response on HTTP 200; otherwise None
            (including when the library has no items to send).
        """
        items = self.get_items_from_query(library)

        content = None
        if items:
            data = json.dumps(self.make_novelist_data_object(items))
            response = self.put(
                self.COLLECTION_DATA_API,
                {
                    "AuthorizedIdentifier": self.AUTHORIZED_IDENTIFIER,
                    "Content-Type": "application/json; charset=utf-8"
                },
                data=data
            )
            if response.status_code == 200:
                content = json.loads(response.content)
                self.log.info(
                    "Success from NoveList: %r", response.content
                )
            else:
                self.log.error("Data sent was: %r", data)
                self.log.error(
                    "Error %s from NoveList: %r", response.status_code,
                    response.content
                )

        return content

    def make_novelist_data_object(self, items):
        """Wrap the items in the request body sent to NoveList."""
        return {
            "customer": "%s:%s" % (self.profile, self.password),
            "records": items,
        }

    def put(self, url, headers, **kwargs):
        """Make an HTTP PUT request with no timeout.

        :param url: URL to send the request to.
        :param headers: Dictionary of request headers.
        :param kwargs: May contain `data`, the request body; everything else
            is passed through to HTTP.put_with_timeout.
        :return: The response from HTTP.put_with_timeout.
        """
        data = kwargs.pop('data', None)
        # This might take a very long time -- disable the normal
        # timeout.
        kwargs['timeout'] = None
        response = HTTP.put_with_timeout(
            url, data, headers=headers, **kwargs
        )
        return response
class MockNoveListAPI(NoveListAPI):
    """A NoveListAPI whose lookups return queued responses.

    No HTTP request is made: `lookup` hands back whatever was queued with
    `setup_method`, in the order it was queued.
    """

    def __init__(self, _db, *args, **kwargs):
        """Store the database session; any other arguments are ignored."""
        self._db = _db
        self.responses = []

    def setup_method(self, *args):
        """Queue one or more responses for future calls to `lookup`."""
        self.responses = self.responses + list(args)

    def lookup(self, identifier, **kwargs):
        """Return the next queued response.

        Accepts (and ignores) the same keyword arguments as
        NoveListAPI.lookup, so the mock can stand in for callers that pass
        them.

        :return: The oldest queued response, or an empty list if nothing
            is queued.
        """
        if not self.responses:
            return []
        response = self.responses[0]
        self.responses = self.responses[1:]
        return response