Source code for core.util.permanent_work_id


from hashlib import md5
import re
import struct
import unicodedata

class WorkIDCalculator(object):

    @classmethod
    def permanent_id(self, normalized_title, normalized_author,
                     grouping_category):
        digest = md5()
        for i in (normalized_title, normalized_author, grouping_category):
            if i == '' or i is None:
                i = '--null--'
            digest.update(i.encode("utf-8"))
        permanent_id = digest.hexdigest().zfill(32)
        permanent_id = "-".join([
            permanent_id[:8], permanent_id[8:12], permanent_id[12:16],
            permanent_id[16:20], permanent_id[20:]])
        return permanent_id

    # Patterns used to clean author names (several are also used on
    # titles and subtitles).

    # "<name> presents ...": group 1 is the text before " presents".
    authorExtract1 = re.compile("^(.+?)\\spresents.*$")
    # "[a|an] <name> presentation ...": group 1 is the text before
    # " presentation", without a leading "a "/"an ".
    authorExtract2 = re.compile("^(?:(?:a|an)\\s)?(.+?)\\spresentation.*$")
    # "distributed [in ...] by <name>": group 1 is the text after "by ".
    distributedByRemoval = re.compile("^distributed (?:in.*\\s)?by\\s(.+)$")
    # A period that follows an uppercase letter and is itself followed by
    # whitespace, another uppercase letter, or the end of the string.
    initialsFix = re.compile("(?<=[A-Z])\\.(?=(\\s|[A-Z]|$))")
    # The two-character sequence "'s".
    apostropheStrip = re.compile("'s")
    # Any single character that is not a word character, digit or
    # whitespace (\d is already covered by \w).
    specialCharacterStrip = re.compile("[^\\w\\d\\s]", re.U)
    # A run of two or more whitespace characters.
    consecutiveCharacterStrip = re.compile("\\s{2,}")
    # Square-bracketed text, brackets included (shortest match).
    bracketedCharacterStrip = re.compile("\\[(.*?)\\]")
    # A trailing role or company word ("editor", "inc", "company", ...)
    # preceded by whitespace: group 1 is everything before it.
    commonAuthorSuffixPattern = re.compile("^(.+?)\\s(?:general editor|editor|editor in chief|etc|inc|inc\\setc|co|corporation|llc|partners|company|home entertainment)$")
    # A leading credit phrase ("edited by", "translated by", ...) followed
    # by whitespace: group 1 is everything after it.
    commonAuthorPrefixPattern = re.compile("^(?:edited by|by the editors of|by|chosen by|translated by|prepared by|translated and edited by|completely rev by|pictures by|selected and adapted by|with a foreword by|with a new foreword by|introd by|introduction by|intro by|retold by)\\s(.+)$")

    # Maps a format name to the grouping category it belongs to
    # ("book", "ebook", "audio", "music", "movie" or "other").
    # Each key appears exactly once; "MusicalScore" used to be listed
    # twice with the same value.
    format_to_grouping_category = {
        "Atlas": "other",
        "Map": "other",
        "TapeCartridge": "other",
        "ChipCartridge": "other",
        "DiscCartridge": "other",
        "TapeCassette": "other",
        "TapeReel": "other",
        "FloppyDisk": "other",
        "CDROM": "other",
        "Software": "other",
        "Globe": "other",
        "Braille": "book",
        "Filmstrip": "movie",
        "Transparency": "other",
        "Slide": "other",
        "Microfilm": "other",
        "Collage": "other",
        "Drawing": "other",
        "Painting": "other",
        "Print": "other",
        "Photonegative": "other",
        "FlashCard": "other",
        "Chart": "other",
        "Photo": "other",
        "MotionPicture": "movie",
        "Kit": "other",
        "MusicalScore": "book",
        "SensorImage": "other",
        "SoundDisc": "audio",
        "SoundCassette": "audio",
        "SoundRecording": "audio",
        "VideoCartridge": "movie",
        "VideoDisc": "movie",
        "VideoCassette": "movie",
        "VideoReel": "movie",
        "Video": "movie",
        "MusicRecording": "music",
        "Electronic": "other",
        "PhysicalObject": "other",
        "Manuscript": "book",
        "eBook": "ebook",
        "Book": "book",
        "Newspaper": "book",
        "Journal": "book",
        "Serial": "book",
        "Unknown": "other",
        "Playaway": "audio",
        "LargePrint": "book",
        "Blu-ray": "movie",
        "DVD": "movie",
        "VerticalFile": "other",
        "CompactDisc": "audio",
        "TapeRecording": "audio",
        "Phonograph": "audio",
        "pdf": "ebook",
        "epub": "ebook",
        "jpg": "other",
        "gif": "other",
        "mp3": "audio",
        "plucker": "ebook",
        "kindle": "ebook",
        "externalLink": "ebook",
        "externalMP3": "audio",
        "interactiveBook": "ebook",
        "overdrive": "ebook",
        "external_web": "ebook",
        "external_ebook": "ebook",
        "external_eaudio": "audio",
        "external_emusic": "music",
        "external_evideo": "movie",
        "text": "ebook",
        "gifs": "other",
        "itunes": "audio",
        "Adobe_EPUB_eBook": "ebook",
        "Kindle_Book": "ebook",
        "Microsoft_eBook": "ebook",
        "OverDrive_WMA_Audiobook": "audio",
        "OverDrive_MP3_Audiobook": "audio",
        "OverDrive_Music": "music",
        "OverDrive_Video": "movie",
        "OverDrive_Read": "ebook",
        "Adobe_PDF_eBook": "ebook",
        "Palm": "ebook",
        "Mobipocket_eBook": "ebook",
        "Disney_Online_Book": "ebook",
        "Open_PDF_eBook": "ebook",
        "Open_EPUB_eBook": "ebook",
        "eContent": "ebook",
        "SeedPacket": "other",
    }

    @classmethod
    def normalize_author(cls, author):
        """
        Converts to NFKD unicode.
        Strips bracket, special characters, dots out.
        Converts to single-space and strips trailing spaces.
        Strips movie studio language surrouding the possible author's name.
        Lowercases.

        Returns de-linted author's name.
        """
        if author is None or len(author) == 0:
            author = ''
        author = unicodedata.normalize("NFKD", str(author))
        author = cls.bracketedCharacterStrip.sub("", author)
        author = cls.specialCharacterStrip.sub("", author)

        groupingAuthor = cls.initialsFix.sub(" ", author)
        groupingAuthor = groupingAuthor.strip().lower();
        groupingAuthor = cls.consecutiveCharacterStrip.sub(" ", groupingAuthor)

        # extract common additional info (especially for movie studios)
        # Remove home entertainment
        for regexp in [
                cls.authorExtract1, cls.authorExtract2,
                cls.commonAuthorSuffixPattern,
                cls.commonAuthorPrefixPattern, cls.distributedByRemoval
                ]:
            match = regexp.search(groupingAuthor)
            if match:
                groupingAuthor = match.groups()[0]

        # Remove md if the author ends with md
        if groupingAuthor.endswith(" md"):
            groupingAuthor = groupingAuthor[:-3]

        if len(groupingAuthor) > 50:
            groupingAuthor = groupingAuthor[:50]
        groupingAuthor = groupingAuthor.strip()

        # TODO: I don't understand this yet.
        # groupingAuthor = RecordGroupingProcessor.mapAuthorAuthority(groupingAuthor);
        return groupingAuthor

    # Matches a string that ends in a "meaningless" subtitle phrase
    # ("a novel", "stories", "2nd edition", "large print", ...). Group 1 is
    # whatever precedes that phrase.
    # Every backslash is doubled: the pattern used to contain bare \S and
    # \s, which are invalid escape sequences in a non-raw string literal.
    # The compiled pattern text is unchanged.
    commonSubtitlesPattern = re.compile(
        "^(.*?)("
        "(a|una)\\s(.*)novel(a|la)?|a(.*)memoir|a(.*)mystery|a(.*)thriller"
        "|by\\s(.+)|a novel of .*|stories|an autobiography|a biography"
        "|a memoir in books|\\d+\\S*\\s*ed(ition)?|\\d+\\S*\\s*update"
        "|1st\\s+ed.*|an? .* story|a .*\\s?book|poems|the movie"
        "|[\\w\\s]+series book \\d+|[\\w\\s]+trilogy book \\d+"
        "|large print|graphic novel|magazine|audio cd"
        ")$",
        re.U)

    # (compiled pattern, replacement word) pairs that spell out the
    # ordinals "1st" through "10th".
    # Each pattern is anchored on word boundaries so that only a
    # free-standing ordinal is rewritten; without them "1st" also matched
    # inside "21st" and produced "2first". Building the list with a
    # comprehension also keeps the loop variables out of the class
    # namespace.
    numerics = [
        (re.compile(r"\b" + ordinal + r"\b"), word)
        for ordinal, word in (
            ("1st", "first"), ("2nd", "second"), ("3rd", "third"),
            ("4th", "fourth"), ("5th", "fifth"), ("6th", "sixth"),
            ("7th", "seventh"), ("8th", "eighth"), ("9th", "ninth"),
            ("10th", "tenth"))
    ]


    @classmethod
    def normalize_subtitle(cls, original_title):
        if original_title == '':
            return original_title

        subtitle = original_title.replace("&#8211;", "-")
        subtitle = subtitle.replace("&", "and")

        # Remove any bracketed parts of the title
        subtitle = cls.bracketedCharacterStrip.sub("", subtitle)
        subtitle = cls.apostropheStrip.sub("s", subtitle)
        subtitle = cls.specialCharacterStrip.sub("", subtitle)
        subtitle = subtitle.lower().strip()

        subtitle = cls.consecutiveCharacterStrip.sub(" ", subtitle)

        # Remove some common subtitles that are meaningless
        match = cls.commonSubtitlesPattern.search(subtitle)
        if match:
            subtitle = match.groups()[0]
        # Normalize numeric titles
        for find, replace in cls.numerics:
            subtitle = find.sub(replace, subtitle)

        subtitle = subtitle[:175].strip()
        return subtitle

    # Any one of ":", ";", "/" or "="; normalize_title searches for the
    # first such character to separate a title from its subtitle.
    subtitleIndicator = re.compile("[:;/=]")


    @classmethod
    def normalize_title(cls, full_title, num_non_filing_characters=0):
        """
        Converts to NFKD unicode.
        Strips bracket, special characters.
        Splits into title and subtitle portions (normalizes subtitle).
        Lowercases.
        """
        if full_title is None:
            full_title = ''
        full_title = unicodedata.normalize("NFKD", full_title)
        # Remove any bracketed parts of the title
        tmp_title = cls.bracketedCharacterStrip.sub("", full_title)
        tmp_title = cls.specialCharacterStrip.sub(
            "", full_title)

        if (num_non_filing_characters > 0
            and num_non_filing_characters < len(full_title)):
            tmp_title = full_title[:num_non_filing_characters]
        else:
            tmp_title = full_title

        tmp_title = cls.make_value_sortable(tmp_title)

        # Make sure we don't strip the entire title
        if len(tmp_title) > 0:
            # And make sure we don't get just special characters
            tmp_title = cls.specialCharacterStrip.sub(" ", tmp_title)
            tmp_title = tmp_title.lower().strip()
        if len(tmp_title) > 0:
            title = tmp_title
        else:
            # print("Just saved us from trimming %s to nothing" % full_title)
            title = cls.specialCharacterStrip.sub("", full_title)


        # If the title includes a : in it, take the first part as the title and the second as the subtitle
        match = cls.subtitleIndicator.search(title)
        if match:
            start = match.start()
            subtitle = cls.normalize_subtitle(title[start+1])
            title = title[:start]

            # Add the subtitle back
            if subtitle is not None and len(subtitle) > 0:
                title = title + " " + subtitle

        # Fix abbreviations
        title = cls.initialsFix.sub(" ", title)

        # Replace '&' with 'and' for better matching
        title = title.replace("&#8211;", "-")
        title = title.replace("&", "and")

        # Remove some common subtitles that are meaningless (do again here in case they were part of the title).
        match = cls.commonSubtitlesPattern.search(title)
        if match and len(match.groups()[0]) != 0:
            title = match.groups()[0]
        title = cls.apostropheStrip.sub("s", title)
        title = cls.specialCharacterStrip.sub(" ", title)
        title = title.lower()
        # Replace consecutive spaces
        title = cls.consecutiveCharacterStrip.sub(" ", title)
        title_end = 100
        if len(title) > title_end:
            title = title[:title_end]
        title = title.strip()
        if not title:
            # print("Title %s was normalized to nothing" % full_title)
            title = full_title
        return title


    # Case-insensitively matches a string that starts with "a", "an",
    # "the", "el", "la", a double quote or a single quote, followed by one
    # whitespace character. Group 1 is the rest of the string.
    sortTrimmingPattern = re.compile("(?i)^(?:(?:a|an|the|el|la|\"|')\\s)(.*)$")
    @classmethod
    def make_value_sortable(cls, curtitle):
        if not curtitle:
            return ""
        sort_title = curtitle.lower()
        match = cls.sortTrimmingPattern.search(sort_title)
        if match:
            sort_title = match.groups()[0]
        sort_title = sort_title.strip()
        return sort_title
Source code for core.util.permanent_work_id

Library Simplified Circulation Manager

Navigation

Related Topics