Source code for api.onix

import logging

from enum import Enum
from lxml import etree

from core.classifier import Classifier
from core.metadata_layer import (
    Metadata,
    IdentifierData,
    SubjectData,
    ContributorData,
    LinkData,
    CirculationData)
from core.model import (
    Classification,
    Identifier,
    Contributor,
    Hyperlink,
    Representation,
    Subject,
    LicensePool, EditionConstants)
from core.util.datetime_helpers import strptime_utc
from core.util.xmlparser import XMLParser


[docs]class UsageStatus(Enum): UNLIMITED = '01' LIMITED = '02' PROHIBITED = '03'
[docs]class UsageUnit(Enum): COPIES = '01' CHARACTERS = '02' WORDS = '03' PAGES = '04' PERCENTAGE = '05' DEVICES = '06' CONCURRENT_USERS = '07' PERCENTAGE_PER_TIME_PERIOD = '08' DAYS = '09' TIMES = '10'
[docs]class ONIXExtractor(object): """Transform an ONIX file into a list of Metadata objects.""" # TODO: '20' indicates a semicolon-separated list of freeform tags, # which could also be useful. SUBJECT_TYPES = { '01': Classifier.DDC, '03': Classifier.LCC, '04': Classifier.LCSH, '10': Classifier.BISAC, '12': Classifier.BIC, } AUDIENCE_TYPES = { '01': Classifier.AUDIENCE_ADULT, # General/trade for adult audience '02': Classifier.AUDIENCE_CHILDREN, # (not for educational purpose) '03': Classifier.AUDIENCE_YOUNG_ADULT, # (not for educational purpose) '04': Classifier.AUDIENCE_CHILDREN, # Primary and secondary/elementary and high school '05': Classifier.AUDIENCE_ADULT, # College/higher education '06': Classifier.AUDIENCE_ADULT, # Professional and scholarly '07': Classifier.AUDIENCE_ADULT, # ESL '08': Classifier.AUDIENCE_ADULT, # Adult education '09': Classifier.AUDIENCE_ADULT, # Second language teaching other than English } CONTRIBUTOR_TYPES = { 'A01': Contributor.AUTHOR_ROLE, 'A02': Contributor.AUTHOR_ROLE, # 'With or as told to' 'A03': Contributor.AUTHOR_ROLE, # Screenplay author 'A04': Contributor.LYRICIST_ROLE, # Libretto author for an opera 'A05': Contributor.LYRICIST_ROLE, 'A06': Contributor.COMPOSER_ROLE, 'A07': Contributor.ILLUSTRATOR_ROLE, # Visual artist who is the primary creator of the work 'A08': Contributor.PHOTOGRAPHER_ROLE, 'A09': Contributor.AUTHOR_ROLE, # 'Created by' 'A10': Contributor.UNKNOWN_ROLE, # 'From an idea by' 'A11': Contributor.DESIGNER_ROLE, 'A12': Contributor.ILLUSTRATOR_ROLE, 'A13': Contributor.PHOTOGRAPHER_ROLE, 'A14': Contributor.AUTHOR_ROLE, # Author of the text for a work that is primarily photos or illustrations 'A15': Contributor.INTRODUCTION_ROLE, # Preface author 'A16': Contributor.UNKNOWN_ROLE, # Prologue author 'A17': Contributor.UNKNOWN_ROLE, # Summary author 'A18': Contributor.UNKNOWN_ROLE, # Supplement author 'A19': Contributor.AFTERWORD_ROLE, # Afterword author 'A20': Contributor.UNKNOWN_ROLE, # Author of notes or annotations 'A21': Contributor.UNKNOWN_ROLE, # Author of commentary on main text 'A22': Contributor.UNKNOWN_ROLE, # Epilogue author 'A23': Contributor.FOREWORD_ROLE, 'A24': Contributor.INTRODUCTION_ROLE, 'A25': Contributor.UNKNOWN_ROLE, # Author/compiler of footnotes 'A26': Contributor.UNKNOWN_ROLE, # Author of memoir accompanying main text 'A27': Contributor.UNKNOWN_ROLE, # Person who carried out experiments reported in the text 'A29': Contributor.INTRODUCTION_ROLE, # Author of introduction and notes 'A30': Contributor.UNKNOWN_ROLE, # Writer of computer programs ancillary to the text 'A31': Contributor.LYRICIST_ROLE, # 'Book and lyrics by' 'A32': Contributor.CONTRIBUTOR_ROLE, # 'Contributions by' 'A33': Contributor.UNKNOWN_ROLE, # Appendix author 'A34': Contributor.UNKNOWN_ROLE, # Compiler of index 'A35': Contributor.ARTIST_ROLE, # 'Drawings by' 'A36': Contributor.ARTIST_ROLE, # Cover artist 'A37': Contributor.UNKNOWN_ROLE, # Responsible for preliminary work on which the work is based 'A38': Contributor.UNKNOWN_ROLE, # Author of the first edition who is not an author of the current edition 'A39': Contributor.UNKNOWN_ROLE, # 'Maps by' 'A40': Contributor.ARTIST_ROLE, # 'Inked or colored by' 'A41': Contributor.UNKNOWN_ROLE, # 'Paper engineering by' 'A42': Contributor.UNKNOWN_ROLE, # 'Continued by' 'A43': Contributor.UNKNOWN_ROLE, # Interviewer 'A44': Contributor.UNKNOWN_ROLE, # Interviewee 'A45': Contributor.AUTHOR_ROLE, # Writer of dialogue, captions in a comic book 'A46': Contributor.ARTIST_ROLE, # Inker 'A47': Contributor.ARTIST_ROLE, # Colorist 'A48': Contributor.ARTIST_ROLE, # Letterer 'A51': Contributor.UNKNOWN_ROLE, # 'Research by' 'A99': Contributor.UNKNOWN_ROLE, # 'Other primary creator' 'B01': Contributor.EDITOR_ROLE, 'B02': Contributor.EDITOR_ROLE, # 'Revised by' 'B03': Contributor.UNKNOWN_ROLE, # 'Retold by' 'B04': Contributor.UNKNOWN_ROLE, # 'Abridged by' 'B05': Contributor.ADAPTER_ROLE, 'B06': Contributor.TRANSLATOR_ROLE, 'B07': Contributor.UNKNOWN_ROLE, # 'As told by' 'B08': Contributor.TRANSLATOR_ROLE, # With commentary on the translation 'B09': Contributor.EDITOR_ROLE, # Series editor 'B10': Contributor.TRANSLATOR_ROLE, # 'Edited and translated by' 'B11': Contributor.EDITOR_ROLE, # Editor-in-chief 'B12': Contributor.EDITOR_ROLE, # Guest editor 'B13': Contributor.EDITOR_ROLE, # Volume editor 'B14': Contributor.EDITOR_ROLE, # Editorial board member 'B15': Contributor.EDITOR_ROLE, # 'Editorial coordination by' 'B16': Contributor.EDITOR_ROLE, # Managing editor 'B17': Contributor.EDITOR_ROLE, # Founding editor of a serial publication 'B18': Contributor.EDITOR_ROLE, # 'Prepared for publication by' 'B19': Contributor.EDITOR_ROLE, # Associate editor 'B20': Contributor.EDITOR_ROLE, # Consultant editor 'B21': Contributor.EDITOR_ROLE, # General editor 'B22': Contributor.UNKNOWN_ROLE, # 'Dramatized by' 'B23': Contributor.EDITOR_ROLE, # 'General rapporteur' 'B24': Contributor.EDITOR_ROLE, # Literary editor 'B25': Contributor.COMPOSER_ROLE, # 'Arranged by (music)' 'B26': Contributor.EDITOR_ROLE, # Technical editor 'B27': Contributor.UNKNOWN_ROLE, # Thesis advisor 'B28': Contributor.UNKNOWN_ROLE, # Thesis examiner 'B29': Contributor.EDITOR_ROLE, # Scientific editor 'B30': Contributor.UNKNOWN_ROLE, # Historical advisor 'B31': Contributor.UNKNOWN_ROLE, # Editor of the first edition who is not an editor of the current edition 'B99': Contributor.EDITOR_ROLE, # Other type of adaptation or editing 'C01': Contributor.UNKNOWN_ROLE, # 'Compiled by' 'C02': Contributor.UNKNOWN_ROLE, # 'Selected by' 'C03': Contributor.UNKNOWN_ROLE, # 'Non-text material selected by' 'C04': Contributor.UNKNOWN_ROLE, # 'Curated by' 'C99': Contributor.UNKNOWN_ROLE, # Other type of compilation 'D01': Contributor.PRODUCER_ROLE, 'D02': Contributor.DIRECTOR_ROLE, 'D03': Contributor.MUSICIAN_ROLE, # Conductor 'D04': Contributor.UNKNOWN_ROLE, # Choreographer 'D05': Contributor.DIRECTOR_ROLE, # Other type of direction 'E01': Contributor.ACTOR_ROLE, 'E02': Contributor.PERFORMER_ROLE, # Dancer 'E03': Contributor.NARRATOR_ROLE, # 'Narrator' 'E04': Contributor.UNKNOWN_ROLE, # Commentator 'E05': Contributor.PERFORMER_ROLE, # Vocal soloist 'E06': Contributor.PERFORMER_ROLE, # Instrumental soloist 'E07': Contributor.NARRATOR_ROLE, # Reader of recorded text, as in an audiobook 'E08': Contributor.PERFORMER_ROLE, # Name of a musical group in a performing role 'E09': Contributor.PERFORMER_ROLE, # Speaker 'E10': Contributor.UNKNOWN_ROLE, # Presenter 'E99': Contributor.PERFORMER_ROLE, # Other type of performer 'F01': Contributor.PHOTOGRAPHER_ROLE, # 'Filmed/photographed by' 'F02': Contributor.EDITOR_ROLE, # 'Editor (film or video)' 'F99': Contributor.UNKNOWN_ROLE, # Other type of recording 'Z01': Contributor.UNKNOWN_ROLE, # 'Assisted by' 'Z02': Contributor.UNKNOWN_ROLE, # 'Honored/dedicated to' 'Z99': Contributor.UNKNOWN_ROLE, # Other creative responsibility } PRODUCT_CONTENT_TYPES = { '10': EditionConstants.BOOK_MEDIUM, # Text (eye-readable) '01': EditionConstants.AUDIO_MEDIUM # Audiobook } _logger = logging.getLogger(__name__)
[docs] @classmethod def parse(cls, file, data_source_name, default_medium=None): metadata_records = [] # TODO: ONIX has plain language 'reference names' and short tags that # may be used interchangably. This code currently only handles short tags, # and it's not comprehensive. parser = XMLParser() tree = etree.parse(file) root = tree.getroot() for record in root.findall('product'): title = parser.text_of_optional_subtag(record, 'descriptivedetail/titledetail/titleelement/b203') if not title: title_prefix = parser.text_of_optional_subtag(record, 'descriptivedetail/titledetail/titleelement/b030') title_without_prefix = parser.text_of_optional_subtag(record, 'descriptivedetail/titledetail/titleelement/b031') if title_prefix and title_without_prefix: title = title_prefix + " " + title_without_prefix medium = parser.text_of_optional_subtag(record, 'b385') if not medium and default_medium: medium = default_medium else: medium = cls.PRODUCT_CONTENT_TYPES.get(medium, EditionConstants.BOOK_MEDIUM) subtitle = parser.text_of_optional_subtag(record, 'descriptivedetail/titledetail/titleelement/b029') language = parser.text_of_optional_subtag(record, 'descriptivedetail/language/b252') or "eng" publisher = parser.text_of_optional_subtag(record, 'publishingdetail/publisher/b081') imprint = parser.text_of_optional_subtag(record, 'publishingdetail/imprint/b079') if imprint == publisher: imprint = None publishing_date = parser.text_of_optional_subtag(record, 'publishingdetail/publishingdate/b306') issued = None if publishing_date: issued = strptime_utc(publishing_date, "%Y%m%d") identifier_tags = parser._xpath(record, 'productidentifier') identifiers = [] primary_identifier = None for tag in identifier_tags: type = parser.text_of_subtag(tag, "b221") if type == '02' or type == '15': primary_identifier = IdentifierData(Identifier.ISBN, parser.text_of_subtag(tag, 'b244')) identifiers.append(primary_identifier) subject_tags = parser._xpath(record, 'descriptivedetail/subject') subjects = [] weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT for tag in subject_tags: type = parser.text_of_subtag(tag, 'b067') if type in cls.SUBJECT_TYPES: subjects.append( SubjectData( cls.SUBJECT_TYPES[type], parser.text_of_subtag(tag, 'b069'), weight=weight ) ) audience_tags = parser._xpath(record, 'descriptivedetail/audience/b204') audiences = [] for tag in audience_tags: if tag.text in cls.AUDIENCE_TYPES: subjects.append( SubjectData( Subject.FREEFORM_AUDIENCE, cls.AUDIENCE_TYPES[tag.text], weight=weight ) ) contributor_tags = parser._xpath(record, 'descriptivedetail/contributor') contributors = [] for tag in contributor_tags: type = parser.text_of_subtag(tag, 'b035') if type in cls.CONTRIBUTOR_TYPES: display_name = parser.text_of_subtag(tag, 'b036') sort_name = parser.text_of_optional_subtag(tag, 'b037') family_name = parser.text_of_optional_subtag(tag, 'b040') bio = parser.text_of_optional_subtag(tag, 'b044') contributors.append(ContributorData(sort_name=sort_name, display_name=display_name, family_name=family_name, roles=[cls.CONTRIBUTOR_TYPES[type]], biography=bio)) collateral_tags = parser._xpath(record, 'collateraldetail/textcontent') links = [] for tag in collateral_tags: type = parser.text_of_subtag(tag, 'x426') # TODO: '03' is the summary in the example I'm testing, but that # might not be generally true. if type == '03': text = parser.text_of_subtag(tag, 'd104') links.append(LinkData(rel=Hyperlink.DESCRIPTION, media_type=Representation.TEXT_HTML_MEDIA_TYPE, content=text)) usage_constraint_tags = parser._xpath(record, 'descriptivedetail/epubusageconstraint') licenses_owned = LicensePool.UNLIMITED_ACCESS if usage_constraint_tags: cls._logger.debug('Found {0} EpubUsageConstraint tags'.format(len(usage_constraint_tags))) for usage_constraint_tag in usage_constraint_tags: usage_status = parser.text_of_subtag(usage_constraint_tag, 'x319') cls._logger.debug('EpubUsageStatus: {0}'.format(usage_status)) if usage_status == UsageStatus.PROHIBITED.value: raise Exception('The content is prohibited') elif usage_status == UsageStatus.LIMITED.value: usage_limit_tags = parser._xpath(record, 'descriptivedetail/epubusageconstraint/epubusagelimit') cls._logger.debug('Found {0} EpubUsageLimit tags'.format(len(usage_limit_tags))) if not usage_limit_tags: continue [usage_limit_tag] = usage_limit_tags usage_unit = parser.text_of_subtag(usage_limit_tag, 'x321') cls._logger.debug('EpubUsageUnit: {0}'.format(usage_unit)) if usage_unit == UsageUnit.COPIES.value or usage_status == UsageUnit.CONCURRENT_USERS.value: quantity_limit = parser.text_of_subtag(usage_limit_tag, 'x320') cls._logger.debug('Quantity: {0}'.format(quantity_limit)) if licenses_owned == LicensePool.UNLIMITED_ACCESS: licenses_owned = 0 licenses_owned += int(quantity_limit) metadata_records.append(Metadata( data_source=data_source_name, title=title, subtitle=subtitle, language=language, medium=medium, publisher=publisher, imprint=imprint, issued=issued, primary_identifier=primary_identifier, identifiers=identifiers, subjects=subjects, contributors=contributors, links=links, circulation=CirculationData( data_source_name, primary_identifier, licenses_owned=licenses_owned, licenses_available=licenses_owned, licenses_reserved=0, patrons_in_hold_queue=0 ) )) return metadata_records