Source code for core.util

# encoding: utf-8
"""Miscellaneous utilities"""

import re
import string
from collections import Counter

import api.flask_sqlalchemy_session
import sqlalchemy
from money import Money
from sqlalchemy import distinct, select
from sqlalchemy.sql.functions import func

# For backwards compatibility, import items that were moved to 
# languages.py
from .languages import LanguageCodes, LookupTable


[docs]def batch(iterable, size=1): """Split up `iterable` into batches of size `size`.""" l = len(iterable) for start in range(0, l, size): yield iterable[start:min(start+size, l)]
[docs]def fast_query_count(query): """Counts the results of a query without using super-slow subquery""" statement = query.selectable table = statement.froms[0] distinct_columns = statement._distinct_on new_columns = [func.count()] if statement._distinct and isinstance(distinct_columns, (list, tuple)): # When using distinct to select from the db, the distinct # columns need to be incorporated into the count itself. new_columns = [func.count(distinct(func.concat(*distinct_columns)))] count_q = select().with_only_columns(new_columns).select_from(table).order_by(None) count = query.session.execute(count_q).scalar() if query._limit_clause is not None and query._limit_clause.value < count: return query._limit_clause.value return count
[docs]def slugify(text, length_limit=None): """Takes a string and turns it into a slug. :Example: >>> slugify('Some (???) Title Somewhere') some-title-somewhere >>> slugify('Sly & the Family Stone') sly-and-the-family-stone >>> slugify('Happy birthday!', length_limit=4) happ """ slug = re.sub('[.!@#\'$,?\(\)]', '', text.lower()) slug = re.sub('&', ' and ', slug) slug = re.sub(' {2,}', ' ', slug) slug = '-'.join(slug.split(' ')) while '--' in slug: slug = re.sub('--', '-', slug) if length_limit: slug = slug[:length_limit] return str(slug)
[docs]class MetadataSimilarity(object): """Estimate how similar two bits of metadata are.""" SEPARATOR = re.compile("\W") @classmethod def _wordbag(cls, s): return set(cls._wordlist(s)) @classmethod def _wordlist(cls, s): return [x.strip().lower() for x in cls.SEPARATOR.split(s) if x.strip()]
[docs] @classmethod def histogram(cls, strings, stopwords=None): """Create a histogram of word frequencies across the given list of strings. """ histogram = Counter() words = 0 for string in strings: for word in cls._wordlist(string): if not stopwords or word not in stopwords: histogram[word] += 1 words += 1 return cls.normalize_histogram(histogram, words)
[docs] @classmethod def normalize_histogram(cls, histogram, total=None): if not total: total = sum(histogram.values()) total = float(total) for k, v in list(histogram.items()): histogram[k] = v/total return histogram
[docs] @classmethod def histogram_distance(cls, strings_1, strings_2, stopwords=None): """Calculate the histogram distance between two sets of strings. The histogram distance is the sum of the word distance for every word that occurs in either histogram. If a word appears in one histogram but not the other, its word distance is its frequency of appearance. If a word appears in both histograms, its word distance is the absolute value of the difference between that word's frequency of appearance in histogram A, and its frequency of appearance in histogram B. If the strings use the same words at exactly the same frequency, the difference will be 0. If the strings use completely different words, the difference will be 1. """ if not stopwords: stopwords = set(["the", "a", "an"]) histogram_1 = cls.histogram(strings_1, stopwords=stopwords) histogram_2 = cls.histogram(strings_2, stopwords=stopwords) return cls.counter_distance(histogram_1, histogram_2)
[docs] @classmethod def counter_distance(cls, counter1, counter2): differences = [] # For every item that appears in histogram 1, compare its # frequency against the frequency of that item in histogram 2. for k, v in list(counter1.items()): difference = abs(v - counter2.get(k, 0)) differences.append(difference) # Add the frequency of every item that appears in histogram 2 # titles but not in histogram 1. for k, v in list(counter2.items()): if k not in counter1: differences.append(abs(v)) return sum(differences) / 2
[docs] @classmethod def most_common(cls, maximum_size, *items): """Return the most common item that's not longer than the max.""" c = Counter() for i in items: if i and len(i) <= maximum_size: c[i] += 1 common = c.most_common(1) if not common: return None return common[0][0]
@classmethod def _wordbags_for_author(cls, author): bags = [cls._wordbag(author.sort_name)] for alias in author.aliases: bags.append(cls._wordbag(alias)) return bags @classmethod def _matching_author_in(cls, to_match, authors): for author in authors: for name in author: if name in to_match: return name return None @classmethod def _word_match_proportion(cls, s1, s2, stopwords): """What proportion of words do s1 and s2 share, considered as wordbags?""" b1 = cls._wordbag(s1) - stopwords b2 = cls._wordbag(s2) - stopwords return b1, b2, cls._proportion(b1, b2) @classmethod def _proportion(cls, s1, s2): if s1 == s2: return 1 total = len(s1.union(s2)) shared = len(s1.intersection(s2)) if not total: return 0 return shared/float(total)
[docs] @classmethod def title_similarity(cls, title1, title2): if title1 == title2: return 1 if title1 == None or title2 == None: return 0 b1, b2, proportion = cls._word_match_proportion( title1, title2, set(['a', 'the', 'an'])) if not b1.union(b2) in (b1, b2): # Penalize titles where one title is not a subset of the # other. "Tom Sawyer Abroad" will not face an extra # penalty vis-a-vis "Tom Sawyer", but it will face an # extra penalty vis-a-vis "Tom Sawyer, Detective". proportion *= 0.4 return proportion
[docs] @classmethod def author_similarity(cls, authors1, authors2): """What percentage of the total number of authors in the two sets are present in both sets? """ return cls._proportion(set(authors1), set(authors2))
[docs] @classmethod def author_name_similarity(cls, authors1, authors2): """What percentage of the total number of authors in the two sets are present in both sets? """ return cls._proportion( set([x.sort_name for x in authors1]), set([x.sort_name for x in authors2]))
[docs]class TitleProcessor(object): title_stopwords = ['The ', 'A ', 'An ']
[docs] @classmethod def sort_title_for(cls, title): if not title: return title for stopword in cls.title_stopwords: if title.startswith(stopword): title = title[len(stopword):] + ", " + stopword.strip() break return title
[docs] @classmethod def extract_subtitle(cls, main_title, subtitled_title): """Extracts a subtitle given a shorter and longer title version :return: subtitle or None """ if not subtitled_title: return None subtitle = subtitled_title.replace(main_title, '') while (subtitle and (subtitle[0] in string.whitespace+':.')): # Trim any leading whitespace or colons subtitle = subtitle[1:] if not subtitle: # The main title and the full title were the same. return None return subtitle
[docs]class Bigrams(object): all_letters = re.compile("^[a-z]+$") def __init__(self, bigrams): self.bigrams = bigrams self.proportional = Counter() total = float(sum(bigrams.values())) for bigram, quantity in self.bigrams.most_common(): proportion = quantity/total if proportion < 0.001: break self.proportional[bigram] = proportion
[docs] def difference_from(self, other_bigrams): total_difference = 0 for bigram, proportion in list(self.proportional.items()): other_proportion = other_bigrams.proportional[bigram] difference = abs(other_proportion - proportion) total_difference += difference # print("%s %.4f-%.4f = %.4f => %.4f" % (bigram, other_proportion, proportion, difference, total_difference)) for bigram, proportion in list(other_bigrams.proportional.items()): if bigram not in self.proportional: total_difference += proportion # print("%s MISSING %.4f => %.4f" % (bigram, proportion, total_difference)) return total_difference
[docs] @classmethod def from_text_files(cls, paths): bigrams = Counter() for path in paths: cls.process_data(open(path).read(), bigrams) return Bigrams(bigrams)
[docs] @classmethod def from_string(cls, string): bigrams = Counter() cls.process_data(string, bigrams) return Bigrams(bigrams)
[docs] @classmethod def process_data(cls, data, bigrams): for i in range(0, len(data)-1): bigram = data[i:i+2].strip() if len(bigram) == 2 and cls.all_letters.match(bigram): bigrams[bigram.lower()] += 1
english_bigram_frequencies = { "ab": 0.0021712725750437792, "ac": 0.005213707466347486, "ad": 0.004761174757224308, "ag": 0.002362898803662714, "ai": 0.004243783939953184, "ak": 0.0016317710390858545, "al": 0.009420640208489336, "am": 0.0022184421082422864, "an": 0.019261384072027876, "ap": 0.001748220824169669, "ar": 0.010173878691752996, "as": 0.009223117788220589, "at": 0.01276525492184598, "au": 0.0010539442574041427, "av": 0.0018941515675025501, "ay": 0.0026193831404295966, "ba": 0.001463729577066173, "be": 0.005828385445840531, "bl": 0.002477874540834075, "bo": 0.0026577083861533835, "br": 0.0010568923532290493, "bu": 0.002104940418983379, "by": 0.0013163247858208383, "ca": 0.004967541464967778, "ce": 0.006574253689541925, "ch": 0.004742012134362416, "ci": 0.002072511364909405, "ck": 0.0032694382698215223, "cl": 0.0010966916468652897, "co": 0.007743173684117428, "cr": 0.0021771687666935925, "ct": 0.0035804623793491783, "cu": 0.0016922070034964417, "da": 0.0015005807748775066, "de": 0.006702495857925366, "di": 0.003999091986485929, "do": 0.0031824694429867747, "ds": 0.0014740479124533464, "ea": 0.007121125465062116, "ec": 0.005023555285641005, "ed": 0.012741670155246725, "ee": 0.0045415416182687605, "ef": 0.0013325393128578251, "eg": 0.001064262592791316, "ei": 0.0018351896510044163, "el": 0.00604359644105872, "em": 0.003947500309550061, "en": 0.01328117169120465, "ep": 0.0017762277345062824, "er": 0.02339461441854706, "es": 0.010507013519967454, "et": 0.004740538086449962, "ev": 0.0024631340617095416, "ew": 0.001260310965147611, "ex": 0.0015035288707024132, "ey": 0.002464608109621995, "fa": 0.0013045324025212116, "fe": 0.002000283017199191, "ff": 0.0015919717454496141, "fi": 0.0024395492951102883, "fo": 0.004199562502579583, "fr": 0.0018233972677047896, "ft": 0.0012278819110736374, "fu": 0.0010524702094916894, "ga": 0.001748220824169669, "ge": 0.003719022883119793, "gh": 0.0031839434908992282, "gi": 0.0014755219603657997, "go": 0.0016332450869983078, "gr": 0.002012075400498818, "ha": 0.010978708851952524, "he": 0.03081791970566211, "hi": 0.007993761829234497, "ho": 0.00562496683392197, "hr": 0.0012308300068985443, "ht": 0.002037134215010525, "ia": 0.0017334803450451354, "ib": 0.001068684736528676, "ic": 0.007367291466441825, "id": 0.004277687041939611, "ie": 0.0031692030117746947, "if": 0.001578705314237534, "ig": 0.0029687324956810396, "il": 0.004321908479313212, "im": 0.0030615975141656004, "in": 0.02371595686346189, "io": 0.005216655562172393, "ir": 0.003085182280764854, "is": 0.008928308205729919, "it": 0.011687725897842583, "iv": 0.0021801168625184995, "ke": 0.004137652490256543, "ki": 0.0015521724518133737, "ks": 0.0010657366407037694, "la": 0.003962240788674595, "ld": 0.0029628363040312264, "le": 0.009557726664347498, "li": 0.006910336613581288, "ll": 0.007724011061255535, "lo": 0.0036350021521099523, "ls": 0.0010274113949799825, "ly": 0.0051871746039233255, "ma": 0.004444254456046839, "me": 0.00833868904074858, "mi": 0.0025766357509684496, "mo": 0.0027977429378364515, "mp": 0.0029304072499572527, "mu": 0.0011689199945755036, "my": 0.0012558888214102512, "na": 0.0025810578947058093, "nc": 0.002830171991910425, "nd": 0.014385233577632207, "ne": 0.007968703014722791, "ng": 0.01172162899982901, "ni": 0.003599625002211072, "no": 0.004734641894800148, "ns": 0.003651216679146939, "nt": 0.008982847978490693, "ny": 0.0015698610267628138, "oc": 0.0017187398659206019, "od": 0.0017718055907689223, "of": 0.009453069262563311, "oi": 0.0010229892512426224, "ok": 0.0014946845832276932, "ol": 0.0031087670473641076, "om": 0.006596364408228725, "on": 0.015450970218335977, "oo": 0.0033667254320434432, "op": 0.0026650786257156503, "or": 0.012544147734977978, "os": 0.0024248088159857547, "ot": 0.004404455162410599, "ou": 0.012628168465987818, "ov": 0.0014814181520156132, "ow": 0.004357285629212092, "pa": 0.0024248088159857547, "pe": 0.004336648958437745, "ph": 0.001596393889186974, "pi": 0.0014047676605680392, "pl": 0.0026621305298907437, "po": 0.0027137222068266105, "pp": 0.0012897919233966781, "pr": 0.003148566341000348, "pu": 0.002514725738645409, "qu": 0.0011851345216124904, "ra": 0.00642242675455923, "rc": 0.001068684736528676, "rd": 0.0022656116414407935, "re": 0.016922070034964418, "rg": 0.0016332450869983078, "ri": 0.00681010135553446, "rk": 0.0017261101054828686, "rl": 0.001046574017841876, "rm": 0.0016317710390858545, "rn": 0.001328117169120465, "ro": 0.007760862259066869, "rr": 0.0010878473593905697, "rs": 0.004743486182274869, "rt": 0.0029893691664553863, "ru": 0.0013177988337332916, "ry": 0.002701929823526984, "sa": 0.0028758674771964788, "sc": 0.0012691552526223311, "se": 0.008092523039368872, "sh": 0.0038148359974292606, "si": 0.004310116096013585, "so": 0.0036910159727831793, "sp": 0.0017718055907689223, "ss": 0.0034404278276661104, "st": 0.01015324202097865, "su": 0.0018631965613410298, "ta": 0.004728745703150335, "te": 0.013699801298341401, "th": 0.031493033649565745, "ti": 0.009472231885425203, "tl": 0.0016745184285470015, "to": 0.010639677832088254, "tr": 0.0039519224532874216, "ts": 0.0027933207940990913, "tt": 0.0022449749706664464, "tu": 0.0017644353512066555, "ty": 0.0020164975442361777, "ub": 0.0011114321259898233, "uc": 0.0013590721752819853, "ue": 0.0012529407255853443, "ug": 0.0015904976975371608, "ui": 0.0010952175989528364, "ul": 0.0035052859358140577, "um": 0.0012323040548109976, "un": 0.004690420457426548, "up": 0.0018956256154150034, "ur": 0.0044309880248347595, "us": 0.004758226661399402, "ut": 0.007224308818933851, "ve": 0.008085152799806605, "vi": 0.0024498676304974616, "wa": 0.00562938897765933, "we": 0.0034581164026155505, "wh": 0.0031721511075996013, "wi": 0.0038531612431530475, "wn": 0.0013413836003325452, "wo": 0.0029937913101927465, "ye": 0.0010288854428924358, "yo": 0.002843438423122505, "ys": 0.0013649683669317988 } english_bigrams = Bigrams(Counter()) english_bigrams.proportional = Counter(english_bigram_frequencies)
[docs]class MoneyUtility(object): DEFAULT_CURRENCY = 'USD'
[docs] @classmethod def parse(cls, amount): """Attempt to turn a string into a Money object.""" currency = cls.DEFAULT_CURRENCY if not amount: amount = '0' amount = str(amount) if amount[0] == '$': currency = 'USD' amount = amount[1:] return Money(amount, currency)
[docs]def is_session(value): """Return a boolean value indicating whether the value is a valid SQLAlchemy session. :param value: Value :type value: Any :return: Boolean value indicating whether the value is a valid SQLAlchemy session or not :rtype: bool """ return isinstance(value, (sqlalchemy.orm.session.Session, api.flask_sqlalchemy_session.flask_scoped_session))
[docs]def first_or_default(collection, default=None): """Return first element of the specified collection or the default value if the collection is empty. :param collection: Collection :type collection: Iterable :param default: Default value :type default: Any """ element = next(iter(collection), None) if element is None: element = default return element
[docs]def chunks(lst, chunk_size, start_index=0): """Yield successive n-sized chunks from lst.""" length = len(lst) for i in range(start_index, length, chunk_size): yield lst[i:i + chunk_size]