Source code for core.util.titles


import re

from fuzzywuzzy import fuzz

from .permanent_work_id import WorkIDCalculator;



def normalize_title_for_matching(title):
    """Standardize a book title so it can be compared with other titles.

    Used when matching titles against each other to pick the best results
    in VIAF author search feeds.

    The pieces of ``title`` are concatenated with ``''.join`` (a no-op for
    a plain string) and the result is handed to
    ``WorkIDCalculator.normalize_title``, which is expected to convert the
    text to NFKD unicode, de-lint special characters, and lowercase it.

    :param title: the title to normalize; a string or an iterable of
        strings.
    :return: whatever ``WorkIDCalculator.normalize_title`` returns for the
        joined title.
    """
    joined_title = ''.join(title)
    return WorkIDCalculator.normalize_title(joined_title)
def title_match_ratio(title1, title2):
    """Score how closely two book titles match.

    Both titles are first passed through ``normalize_title_for_matching``
    and the normalized forms are then compared with ``fuzz.ratio``.

    :param title1: first book title.
    :param title2: second book title.
    :return: a number between 0 and 100 representing the percent match
        (Levenshtein Distance) between the two normalized titles.
    """
    normalized_pair = [
        normalize_title_for_matching(candidate)
        for candidate in (title1, title2)
    ]
    return fuzz.ratio(*normalized_pair)
# Marks the point where the "fluff" of a title begins. Alternatives:
#   * " edited by" / " compiled by" / " published by" (imprint phrases),
#   * a space followed by one of : ; ( [
#   * a ':' or ';' directly attached to the preceding word, as long as it
#     is followed by whitespace or the end of the title. The lookahead
#     keeps things like "12:30" intact.
_TITLE_FLUFF_MARKER = re.compile(
    r' (?:edited by|compiled by|published by)'
    r'| [:;(\[]'
    r'|[:;](?=\s|$)',
    re.UNICODE,
)

# A run of characters that are not letters or digits (underscore included).
_TITLE_NON_WORD_RUN = re.compile(r'[\W_]+', re.UNICODE)


def _squash_title(text):
    """Replace non-word runs with single spaces, lowercase, and trim."""
    return _TITLE_NON_WORD_RUN.sub(' ', text).lower().strip()


def unfluff_title(title):
    """Remove the parts of a title that are deemed to be add-ons.

    Add-ons are things like imprint information, inserted subtitles and
    corporate names. For example, each of::

        Hello World, edited by Bob Bobbinson
        Hello World: The True and Amazing Adventures of Bob
        Hello World (Unabridged)

    is reduced to ``"hello world"``.

    (TODO: later add logic for something like
    "Hello World, Harvard University, publisher".)

    The title is cut at the *first* fluff marker, so stacked add-ons such
    as "Hello World (Unabridged) (Audio)" are removed together. If cutting
    would leave nothing (the title starts with a marker), the whole title
    is kept instead of returning an empty string.

    :param title: the title string to clean up.
    :return: the core title, lowercased, with every run of non-word
        characters (punctuation, underscores, whitespace) collapsed to a
        single space and surrounding whitespace removed.
    """
    marker = _TITLE_FLUFF_MARKER.search(title)
    if marker is not None:
        core = _squash_title(title[:marker.start()])
        if core:
            return core
    # No marker found, or nothing meaningful precedes it.
    return _squash_title(title)