# encoding: utf-8
# If the genre classification does not match the fiction classification, throw
# away the genre classifications.
#
# E.g. "Investigations -- nonfiction" maps to Mystery, but Mystery
# conflicts with Nonfiction.
# SQL to find commonly used DDC classifications
# select count(editions.id) as c, subjects.identifier from editions join identifiers on workrecords.primary_identifier_id=workidentifiers.id join classifications on workidentifiers.id=classifications.work_identifier_id join subjects on classifications.subject_id=subjects.id where subjects.type = 'DDC' and not subjects.identifier like '8%' group by subjects.identifier order by c desc;
# SQL to find commonly used classifications not assigned to a genre
# select count(identifiers.id) as c, subjects.type, substr(subjects.identifier, 0, 20) as i, substr(subjects.name, 0, 20) as n from workidentifiers join classifications on workidentifiers.id=classifications.work_identifier_id join subjects on classifications.subject_id=subjects.id where subjects.genre_id is null and subjects.fiction is null group by subjects.type, i, n order by c desc;
import logging
import json
import os
import pkgutil
import re
from urllib.parse import urlparse
from collections import (
Counter,
defaultdict,
)
from sqlalchemy.orm.session import Session
from sqlalchemy.sql.expression import and_
# Directory that holds this module, and the "resources" directory that
# sits one level above it.
base_dir = os.path.dirname(__file__)
resource_dir = os.path.join(base_dir, "..", "resources")

# Placeholder constants; the code that consumes them is outside this
# section of the file.
NO_VALUE = "NONE"
NO_NUMBER = -1
class ClassifierConstants(object):
    """Names of classification schemes, audiences and age cutoffs shared
    by every classifier in this module.
    """

    # --- Classification schemes -------------------------------------
    DDC = "DDC"
    LCC = "LCC"
    LCSH = "LCSH"
    FAST = "FAST"
    OVERDRIVE = "Overdrive"
    RBDIGITAL = "RBdigital"
    BISAC = "BISAC"
    BIC = "BIC"
    TAG = "tag"  # Folksonomic tags.

    # Appeal controlled vocabulary developed by NYPL
    NYPL_APPEAL = "NYPL Appeal"

    GRADE_LEVEL = "Grade level"  # "1-2", "Grade 4", "Kindergarten", etc.
    AGE_RANGE = "schema:typicalAgeRange"  # "0-2", etc.
    AXIS_360_AUDIENCE = "Axis 360 Audience"
    RBDIGITAL_AUDIENCE = "RBdigital Audience"

    # We know this says something about the audience but we're not
    # sure what. Could be any of the values from GRADE_LEVEL or
    # AGE_RANGE, plus "YA", "Adult", etc.
    FREEFORM_AUDIENCE = "schema:audience"

    GUTENBERG_BOOKSHELF = "gutenberg:bookshelf"
    TOPIC = "schema:Topic"
    PLACE = "schema:Place"
    PERSON = "schema:Person"
    ORGANIZATION = "schema:Organization"
    LEXILE_SCORE = "Lexile"
    ATOS_SCORE = "ATOS"
    INTEREST_LEVEL = "Interest Level"

    # --- Audiences --------------------------------------------------
    AUDIENCE_ADULT = "Adult"
    AUDIENCE_ADULTS_ONLY = "Adults Only"
    AUDIENCE_YOUNG_ADULT = "Young Adult"
    AUDIENCE_CHILDREN = "Children"
    AUDIENCE_ALL_AGES = "All Ages"
    AUDIENCE_RESEARCH = "Research"

    # --- Age cutoffs ------------------------------------------------
    # A book for a child younger than 14 is a children's book.
    # A book for a child 14 or older is a young adult book.
    YOUNG_ADULT_AGE_CUTOFF = 14
    ADULT_AGE_CUTOFF = 18

    # "All ages" actually means "all ages with reading fluency".
    ALL_AGES_AGE_CUTOFF = 8

    # --- Audience groupings -----------------------------------------
    AUDIENCES_YOUNG_CHILDREN = [AUDIENCE_CHILDREN, AUDIENCE_ALL_AGES]
    AUDIENCES_JUVENILE = AUDIENCES_YOUNG_CHILDREN + [AUDIENCE_YOUNG_ADULT]
    AUDIENCES_ADULT = [
        AUDIENCE_ADULT,
        AUDIENCE_ADULTS_ONLY,
        AUDIENCE_ALL_AGES,
    ]
    AUDIENCES = {
        AUDIENCE_ADULT,
        AUDIENCE_ADULTS_ONLY,
        AUDIENCE_YOUNG_ADULT,
        AUDIENCE_CHILDREN,
        AUDIENCE_ALL_AGES,
        AUDIENCE_RESEARCH,
    }

    # --- URI prefixes for the internal vocabularies -----------------
    SIMPLIFIED_GENRE = "http://librarysimplified.org/terms/genres/Simplified/"
    SIMPLIFIED_FICTION_STATUS = "http://librarysimplified.org/terms/fiction/"
class Classifier(ClassifierConstants):
    """Turn an external classification into an internal genre, an
    audience, an age level, and a fiction status.

    This base class supplies neutral defaults (no genre, no target
    age); scheme-specific subclasses override the individual hooks.
    """

    AUDIENCES_NO_RESEARCH = [
        x for x in ClassifierConstants.AUDIENCES
        if x != ClassifierConstants.AUDIENCE_RESEARCH
    ]

    # Registry consulted by lookup(): classification scheme -> classifier.
    classifiers = dict()

    @classmethod
    def range_tuple(cls, lower, upper):
        """Turn a pair of ages into a tuple that represents an age range.

        This may be turned into an inclusive postgres NumericRange later,
        but this code should not depend on postgres.
        """
        # Just in case the upper and lower ranges are mixed up,
        # and no prior code caught this, un-mix them.
        if lower and upper and lower > upper:
            lower, upper = upper, lower
        return (lower, upper)

    @classmethod
    def lookup(cls, scheme):
        """Look up a classifier for a classification scheme.

        :return: The registered classifier, or None if there is none.
        """
        return cls.classifiers.get(scheme, None)

    @classmethod
    def name_for(cls, identifier):
        """Look up a human-readable name for the given identifier."""
        return None

    @classmethod
    def classify(cls, subject):
        """Try to determine genre, audience, target age, and fiction status
        for the given Subject.

        :param subject: An object with `identifier` and `name` attributes.
        :return: A 4-tuple (genre, audience, target_age, fiction).
        """
        identifier, name = cls.scrub_identifier_and_name(
            subject.identifier, subject.name
        )
        fiction = cls.is_fiction(identifier, name)
        audience = cls.audience(identifier, name)

        target_age = cls.target_age(identifier, name)
        if target_age == cls.range_tuple(None, None):
            # Nothing explicit was found; fall back on the audience.
            target_age = cls.default_target_age_for_audience(audience)

        genre = cls.genre(identifier, name, fiction, audience)
        return (genre, audience, target_age, fiction)

    @classmethod
    def scrub_identifier_and_name(cls, identifier, name):
        """Prepare identifier and name from within a call to classify()."""
        identifier = cls.scrub_identifier(identifier)
        if isinstance(identifier, tuple):
            # scrub_identifier returned a canonical value for name as
            # well. Use it in preference to any name associated with
            # the subject.
            identifier, name = identifier
        elif not name:
            name = identifier
        name = cls.scrub_name(name)
        return identifier, name

    @classmethod
    def scrub_identifier(cls, identifier):
        """Prepare an identifier from within a call to classify().

        This may involve data normalization, conversion to lowercase,
        etc.
        """
        if identifier is None:
            return None
        return Lowercased(identifier)

    @classmethod
    def scrub_name(cls, name):
        """Prepare a name from within a call to classify()."""
        if name is None:
            return None
        return Lowercased(name)

    @classmethod
    def genre(cls, identifier, name, fiction=None, audience=None):
        """Is this identifier associated with a particular Genre?"""
        return None

    @classmethod
    def genre_match(cls, query):
        """Does this query string match a particular Genre, and which part
        of the query matches?"""
        return None, None

    @classmethod
    def is_fiction(cls, identifier, name):
        """Is this identifier+name particularly indicative of fiction?
        How about nonfiction?

        :return: True, False, or None when the name says nothing
            (including when there is no name at all).
        """
        if not name:
            # classify() passes name=None when a subject has neither
            # an identifier nor a name; there is nothing to inspect.
            return None
        if "nonfiction" in name:
            return False
        if "fiction" in name:
            return True
        return None

    @classmethod
    def audience(cls, identifier, name):
        """What does this identifier+name say about the audience for
        this book?

        :return: AUDIENCE_CHILDREN, AUDIENCE_YOUNG_ADULT, or None.
        """
        if not name:
            return None
        if 'juvenile' in name:
            return cls.AUDIENCE_CHILDREN
        # "YA" is case-sensitive, so it has to be checked against the
        # un-lowercased text. A plain str has no `.original`; in that
        # case the string itself is the original.
        original = getattr(name, "original", name)
        if 'young adult' in name or "YA" in original:
            return cls.AUDIENCE_YOUNG_ADULT
        return None

    @classmethod
    def audience_match(cls, query):
        """Does this query string match a particular Audience, and which
        part of the query matches?"""
        return (None, None)

    @classmethod
    def target_age(cls, identifier, name):
        """For children's books, what does this identifier+name say
        about the target age for this book?
        """
        return cls.range_tuple(None, None)

    @classmethod
    def default_target_age_for_audience(cls, audience):
        """The default target age for a given audience.

        We don't know what age range a children's book is appropriate
        for, but we can make a decent guess for a YA book, for an
        'Adult' book it's pretty clear, and for an 'Adults Only' book
        it's very clear.
        """
        if audience == Classifier.AUDIENCE_YOUNG_ADULT:
            return cls.range_tuple(14, 17)
        elif audience in (
            Classifier.AUDIENCE_ADULT, Classifier.AUDIENCE_ADULTS_ONLY
        ):
            return cls.range_tuple(18, None)
        return cls.range_tuple(None, None)

    @classmethod
    def default_audience_for_target_age(cls, range):
        """Guess an audience from a (lower, upper) target age range.

        :param range: A 2-item sequence (lower, upper), or None.
        :return: One of the AUDIENCE_* constants, or None if the range
            carries no information.
        """
        if range is None:
            return None
        lower = range[0]
        upper = range[1]
        if not lower and not upper:
            # You could interpret this as 'all ages' but it's more
            # likely the data is simply missing.
            return None

        if not lower:
            if upper >= cls.ADULT_AGE_CUTOFF:
                # e.g. "up to 20 years", though this doesn't
                # really make sense.
                #
                # The 'all ages' interpretation is more plausible here
                # but it's still more likely that this is simply a
                # book for grown-ups and no lower bound was provided.
                return cls.AUDIENCE_ADULT
            elif upper > cls.YOUNG_ADULT_AGE_CUTOFF:
                # e.g. "up to 15 years"
                return cls.AUDIENCE_YOUNG_ADULT
            else:
                # e.g. "up to 14 years"
                return cls.AUDIENCE_CHILDREN

        # At this point we can assume that lower is not None.
        if lower >= cls.ADULT_AGE_CUTOFF:
            return cls.AUDIENCE_ADULT
        elif lower >= cls.YOUNG_ADULT_AGE_CUTOFF:
            return cls.AUDIENCE_YOUNG_ADULT
        elif lower <= cls.ALL_AGES_AGE_CUTOFF and (
            upper is not None and upper >= cls.ADULT_AGE_CUTOFF
        ):
            # e.g. "for children ages 7-77". The 'all ages' reading
            # is here the most plausible.
            return cls.AUDIENCE_ALL_AGES
        elif lower >= 12 and (not upper or upper >= cls.YOUNG_ADULT_AGE_CUTOFF):
            # Although we treat "Young Adult" as starting at 14, many
            # outside sources treat it as starting at 12. As such we
            # treat "12 and up" or "12-14" as an indicator of a Young
            # Adult audience, with a target age that overlaps what we
            # consider a Children audience.
            return cls.AUDIENCE_YOUNG_ADULT
        else:
            return cls.AUDIENCE_CHILDREN

    @classmethod
    def and_up(cls, young, keyword):
        """Encapsulates the logic of what "[x] and up" actually means.

        Given the lower end of an age range, tries to determine the
        upper end of the range.

        :return: The inferred upper bound, or None if `young` is None
            or `keyword` does not end in an "and up"-style marker.
        """
        if young is None:
            return None
        if not keyword.endswith(("and up", "and up.", "+", "+.")):
            return None

        if young >= 18:
            old = young
        elif young >= 12:
            # "12 and up", "14 and up", etc. are
            # generally intended to cover the entire
            # YA span.
            old = 17
        elif young >= 8:
            # "8 and up" means something like "8-12"
            old = young + 4
        else:
            # Whereas "3 and up" really means more
            # like "3 to 5".
            old = young + 2
        return old
class GradeLevelClassifier(Classifier):
    """Derive a target age (and from it an audience) from a US school
    grade level such as "Grade 4", "gr. k-2" or "fifth grade".
    """

    # How old a kid is when they start grade N in the US.
    american_grade_to_age = {
        # Preschool: 3-4 years
        'preschool': 3,
        'pre-school': 3,
        'p': 3,
        'pk': 4,

        # Easy readers
        'kindergarten': 5,
        'k': 5,
        '0': 5,
        'first': 6,
        '1': 6,
        'second': 7,
        '2': 7,

        # Chapter Books
        'third': 8,
        '3': 8,
        'fourth': 9,
        '4': 9,
        'fifth': 10,
        '5': 10,
        'sixth': 11,
        '6': 11,
        '7': 12,
        '8': 13,

        # YA
        '9': 14,
        '10': 15,
        '11': 16,
        '12': 17,
    }

    # Regular expressions that match common ways of expressing grade
    # levels. Raw strings throughout: "\." in a non-raw literal is an
    # invalid escape sequence.
    grade_res = [
        re.compile(x, re.I) for x in [
            r"grades? ([kp0-9]+) to ([kp0-9]+)?",
            r"grades? ([kp0-9]+) ?-? ?([kp0-9]+)?",
            r"gr\.? ([kp0-9]+) ?-? ?([kp0-9]+)?",
            r"grades?: ([kp0-9]+) to ([kp0-9]+)",
            r"grades?: ([kp0-9]+) ?-? ?([kp0-9]+)?",
            r"gr\.? ([kp0-9]+)",
            r"([0-9]+)[tnsr][hdt] grade",
            r"([a-z]+) grade",
            r'\b(kindergarten|preschool)\b',
        ]
    ]

    # Looser patterns with no explicit "grade" marker; only used when
    # the caller does not require one.
    generic_grade_res = [
        re.compile(r"([kp0-9]+) ?- ?([0-9]+)", re.I),
        re.compile(r"([kp0-9]+) ?to ?([0-9]+)", re.I),
        re.compile(r"^([0-9]+)\b", re.I),
        re.compile(r"^([kp])\b", re.I),
    ]

    @classmethod
    def audience(cls, identifier, name, require_explicit_age_marker=False):
        """Infer an audience from whatever target age the grade implies."""
        target_age = cls.target_age(identifier, name, require_explicit_age_marker)
        return cls.default_audience_for_target_age(target_age)

    @classmethod
    def target_age(cls, identifier, name, require_explicit_grade_marker=False):
        """Translate a grade level found in identifier or name into an
        age range.

        :param require_explicit_grade_marker: If true, only the patterns
            in `grade_res` are tried; otherwise `generic_grade_res` is
            tried as well.
        :return: A (lower, upper) tuple; (None, None) when nothing usable
            was found.
        """
        nothing = cls.range_tuple(None, None)

        if (identifier and "education" in identifier) or (name and 'education' in name):
            # This is a book about teaching, e.g. fifth grade.
            return nothing
        if (identifier and 'grader' in identifier) or (name and 'grader' in name):
            # This is a book about, e.g. fifth graders.
            return nothing

        if require_explicit_grade_marker:
            res = cls.grade_res
        else:
            res = cls.grade_res + cls.generic_grade_res

        for r in res:
            for k in identifier, name:
                if not k:
                    continue
                m = r.search(k)
                if not m:
                    continue

                gr = m.groups()
                if len(gr) == 1:
                    young, old = gr[0], None
                else:
                    young, old = gr

                # Strip leading zeros (but leave a bare "0" alone, since
                # it is itself a key in the grade table).
                if young and young.lstrip('0'):
                    young = young.lstrip("0")
                if old and old.lstrip('0'):
                    old = old.lstrip("0")

                # Grade -> age. Unknown grades become None.
                young = cls.american_grade_to_age.get(young)
                old = cls.american_grade_to_age.get(old)

                if not young and not old:
                    return nothing

                if old is None:
                    old = cls.and_up(young, k)
                if old is None and young is not None:
                    old = young
                if young is None and old is not None:
                    young = old
                if old and young and old < young:
                    young, old = old, young
                return cls.range_tuple(young, old)
        return nothing

    @classmethod
    def target_age_match(cls, query):
        """Find a target age in a query string, along with the words
        that express it.

        :return: A 2-tuple (target_age, matched_text); matched_text is
            None if none of `grade_res` matches the query.
        """
        grade_words = None
        target_age = cls.target_age(None, query, require_explicit_grade_marker=True)
        # NOTE: target_age is always a 2-tuple, which is truthy even when
        # it is (None, None), so this branch is always taken.
        if target_age:
            for r in cls.grade_res:
                match = r.search(query)
                if match:
                    grade_words = match.group()
                    break
        return (target_age, grade_words)
class InterestLevelClassifier(Classifier):
    """Map the interest-level codes 'lg', 'mg', 'mg+' and 'ug' to an
    audience and a target age range.
    """

    @classmethod
    def audience(cls, identifier, name):
        """Return the audience implied by an interest-level code, or
        None for an unrecognized code.
        """
        if identifier in ('lg', 'mg+', 'mg'):
            return cls.AUDIENCE_CHILDREN
        elif identifier == 'ug':
            return cls.AUDIENCE_YOUNG_ADULT
        else:
            return None

    @classmethod
    def target_age(cls, identifier, name):
        """Return the age range implied by an interest-level code.

        :return: A (lower, upper) tuple. Unrecognized codes yield
            (None, None), the same "no information" value every other
            classifier returns, so that Classifier.classify() can apply
            the audience-based default.
        """
        if identifier == 'lg':
            return cls.range_tuple(5, 8)
        if identifier in ('mg+', 'mg'):
            return cls.range_tuple(9, 13)
        if identifier == 'ug':
            return cls.range_tuple(14, 17)
        return cls.range_tuple(None, None)
class AgeClassifier(Classifier):
    """Derive a target age (and from it an audience) from text that
    states an age in years, such as "ages 9-12" or "5 years and up".
    """

    # Regular expressions that match common ways of expressing ages.
    age_res = [
        re.compile(x, re.I) for x in [
            r"age ([0-9]+) ?-? ?([0-9]+)?",
            r"age: ([0-9]+) ?-? ?([0-9]+)?",
            r"age: ([0-9]+) to ([0-9]+)",
            r"ages ([0-9]+) ?- ?([0-9]+)",
            r"([0-9]+) ?- ?([0-9]+) years?",
            r"([0-9]+) years?",
            r"ages ([0-9]+)+",
            r"([0-9]+) and up",
            r"([0-9]+) years? and up",
        ]
    ]

    # Looser patterns with no explicit age marker; only used when the
    # caller does not require one.
    generic_age_res = [
        re.compile(r"([0-9]+) ?- ?([0-9]+)", re.I),
        re.compile(r"^([0-9]+)\b", re.I),
    ]

    baby_re = re.compile(r"^baby ?- ?([0-9]+) year", re.I)

    @classmethod
    def audience(cls, identifier, name, require_explicit_age_marker=False):
        """Infer an audience from whatever target age the text implies."""
        target_age = cls.target_age(identifier, name, require_explicit_age_marker)
        return cls.default_audience_for_target_age(target_age)

    @classmethod
    def target_age(cls, identifier, name, require_explicit_age_marker=False):
        """Extract an age range from identifier or name.

        :param require_explicit_age_marker: If true, only the patterns
            in `age_res` are tried; otherwise `generic_age_res` is tried
            as well.
        :return: A (lower, upper) tuple. A bound greater than 99 is
            treated as "not an age" and reported as None.
        """
        if require_explicit_age_marker:
            res = cls.age_res
        else:
            res = cls.age_res + cls.generic_age_res

        if identifier:
            match = cls.baby_re.search(identifier)
            if match:
                # This is for babies.
                upper_bound = int(match.groups()[0])
                return cls.range_tuple(0, upper_bound)

        for r in res:
            for k in identifier, name:
                if not k:
                    continue
                m = r.search(k)
                if not m:
                    continue

                groups = m.groups()
                young = old = None
                if groups:
                    young = int(groups[0])
                    if len(groups) > 1 and groups[1] is not None:
                        old = int(groups[1])

                if old is None:
                    old = cls.and_up(young, k)
                if old is None and young is not None:
                    old = young
                if young is None and old is not None:
                    young = old

                # Each check below guards against None: once a bound
                # has been discarded it can no longer be ordered against
                # an int (Python 3 raises TypeError for that).
                if old is not None and old > 99:
                    # This is not an age at all.
                    old = None
                if young is not None and young > 99:
                    # This is not an age at all.
                    young = None
                if young is not None and old is not None and young > old:
                    young, old = old, young
                return cls.range_tuple(young, old)

        return cls.range_tuple(None, None)

    @classmethod
    def target_age_match(cls, query):
        """Find a target age in a query string, along with the words
        that express it.

        :return: A 2-tuple (target_age, matched_text); matched_text is
            None if none of `age_res` matches the query.
        """
        age_words = None
        target_age = cls.target_age(None, query, require_explicit_age_marker=True)
        # NOTE: target_age is always a 2-tuple, which is truthy even when
        # it is (None, None), so this branch is always taken.
        if target_age:
            for r in cls.age_res:
                match = r.search(query)
                if match:
                    age_words = match.group()
                    break
        return (target_age, age_words)
# This is the large-scale structure of our classification system.
#
# If the name of a genre is a string, it's the name of the genre
# and there are no subgenres.
#
# If the name of a genre is a dictionary, the 'name' argument is the
# name of the genre, and the 'subgenres' argument is the list of the
# subgenres.
COMICS_AND_GRAPHIC_NOVELS = "Comics & Graphic Novels"

# Top-level fiction genres. A plain string is a genre with no
# subgenres; a dictionary carries the genre's name plus extra settings.
fiction_genres = [
    "Adventure",
    "Classics",
    COMICS_AND_GRAPHIC_NOVELS,
    "Drama",
    {"name": "Erotica", "audiences": Classifier.AUDIENCE_ADULTS_ONLY},
    {
        "name": "Fantasy",
        "subgenres": [
            "Epic Fantasy",
            "Historical Fantasy",
            "Urban Fantasy",
        ],
    },
    "Folklore",
    "Historical Fiction",
    {
        "name": "Horror",
        "subgenres": [
            "Gothic Horror",
            "Ghost Stories",
            "Vampires",
            "Werewolves",
            "Occult Horror",
        ],
    },
    "Humorous Fiction",
    "Literary Fiction",
    "LGBTQ Fiction",
    {
        "name": "Mystery",
        "subgenres": [
            "Crime & Detective Stories",
            "Hard-Boiled Mystery",
            "Police Procedural",
            "Cozy Mystery",
            "Historical Mystery",
            "Paranormal Mystery",
            "Women Detectives",
        ],
    },
    "Poetry",
    "Religious Fiction",
    {
        "name": "Romance",
        "subgenres": [
            "Contemporary Romance",
            "Gothic Romance",
            "Historical Romance",
            "Paranormal Romance",
            "Western Romance",
            "Romantic Suspense",
        ],
    },
    {
        "name": "Science Fiction",
        "subgenres": [
            "Dystopian SF",
            "Space Opera",
            "Cyberpunk",
            "Military SF",
            "Alternative History",
            "Steampunk",
            "Romantic SF",
            "Media Tie-in SF",
        ],
    },
    "Short Stories",
    {
        "name": "Suspense/Thriller",
        "subgenres": [
            "Historical Thriller",
            "Espionage",
            "Supernatural Thriller",
            "Medical Thriller",
            "Political Thriller",
            "Psychological Thriller",
            "Technothriller",
            "Legal Thriller",
            "Military Thriller",
        ],
    },
    "Urban Fiction",
    "Westerns",
    "Women's Fiction",
]
# Top-level nonfiction genres, in the same format as the fiction list:
# a plain string is a genre with no subgenres, a dictionary names a
# genre together with its subgenres.
nonfiction_genres = [
    {
        "name": "Art & Design",
        "subgenres": [
            "Architecture",
            "Art",
            "Art Criticism & Theory",
            "Art History",
            "Design",
            "Fashion",
            "Photography",
        ],
    },
    "Biography & Memoir",
    "Education",
    {
        "name": "Personal Finance & Business",
        "subgenres": [
            "Business",
            "Economics",
            "Management & Leadership",
            "Personal Finance & Investing",
            "Real Estate",
        ],
    },
    {
        "name": "Parenting & Family",
        "subgenres": [
            "Family & Relationships",
            "Parenting",
        ],
    },
    {
        "name": "Food & Health",
        "subgenres": [
            "Bartending & Cocktails",
            "Cooking",
            "Health & Diet",
            "Vegetarian & Vegan",
        ],
    },
    {
        "name": "History",
        "subgenres": [
            "African History",
            "Ancient History",
            "Asian History",
            "Civil War History",
            "European History",
            "Latin American History",
            "Medieval History",
            "Middle East History",
            "Military History",
            "Modern History",
            "Renaissance & Early Modern History",
            "United States History",
            "World History",
        ],
    },
    {
        "name": "Hobbies & Home",
        "subgenres": [
            "Antiques & Collectibles",
            "Crafts & Hobbies",
            "Gardening",
            "Games",
            "House & Home",
            "Pets",
        ],
    },
    "Humorous Nonfiction",
    {
        "name": "Entertainment",
        "subgenres": [
            "Film & TV",
            "Music",
            "Performing Arts",
        ],
    },
    "Life Strategies",
    "Literary Criticism",
    "Periodicals",
    "Philosophy",
    "Political Science",
    {
        "name": "Reference & Study Aids",
        "subgenres": [
            "Dictionaries",
            "Foreign Language Study",
            "Law",
            "Study Aids",
        ],
    },
    {
        "name": "Religion & Spirituality",
        "subgenres": [
            "Body, Mind & Spirit",
            "Buddhism",
            "Christianity",
            "Hinduism",
            "Islam",
            "Judaism",
        ],
    },
    {
        "name": "Science & Technology",
        "subgenres": [
            "Computers",
            "Mathematics",
            "Medical",
            "Nature",
            "Psychology",
            "Science",
            "Social Sciences",
            "Technology",
        ],
    },
    "Self-Help",
    "Sports",
    "Travel",
    "True Crime",
]
class GenreData(object):
    """A node in the genre tree: a genre's name, its fiction status,
    its parent and subgenres, and an optional audience restriction.
    """

    def __init__(self, name, is_fiction, parent=None, audience_restriction=None):
        """
        :param name: Human-readable genre name.
        :param is_fiction: Fiction status of the genre.
        :param parent: The parent GenreData, if this is a subgenre.
        :param audience_restriction: An audience name or a list of
            them; a single string is wrapped in a list.
        """
        self.name = name
        self.parent = parent
        self.is_fiction = is_fiction
        self.subgenres = []
        if isinstance(audience_restriction, str):
            audience_restriction = [audience_restriction]
        self.audience_restriction = audience_restriction

    def __repr__(self):
        return "<GenreData: %s>" % self.name

    @property
    def self_and_subgenres(self):
        """Yield this genre, then every genre beneath it."""
        yield self
        for child in self.all_subgenres:
            yield child

    @property
    def all_subgenres(self):
        """Yield every genre beneath this one, depth-first."""
        for child in self.subgenres:
            for subgenre in child.self_and_subgenres:
                yield subgenre

    @property
    def parents(self):
        """Iterate over this genre's ancestors, outermost first."""
        parents = []
        p = self.parent
        while p:
            parents.append(p)
            p = p.parent
        return reversed(parents)

    def has_subgenre(self, subgenre):
        """Is `subgenre` somewhere beneath this genre?"""
        for s in self.subgenres:
            if s == subgenre or s.has_subgenre(subgenre):
                return True
        return False

    @property
    def variable_name(self):
        """The genre name converted to a Python-safe identifier."""
        return self.name.replace("-", "_").replace(", & ", "_").replace(", ", "_").replace(" & ", "_").replace(" ", "_").replace("/", "_").replace("'", "")

    @staticmethod
    def _audience_restriction_from(data, keys, default=None):
        """Return the value of the first of `keys` present in `data`.

        Genre definitions have spelled this setting three different
        ways ('audience_restriction', 'audiences', 'audience'); all of
        them are honored so that none is silently ignored.
        """
        for key in keys:
            if key in data:
                return data[key]
        return default

    @classmethod
    def populate(cls, namespace, genres, fiction_source, nonfiction_source):
        """Create a GenreData object for every genre and subgenre in the given
        list of fiction and nonfiction genres.

        :param namespace: Dictionary that receives each GenreData under
            its `variable_name`.
        :param genres: Dictionary that receives each GenreData under its
            name.
        :param fiction_source: Genre definitions that default to fiction.
        :param nonfiction_source: Genre definitions that default to
            nonfiction.
        :raises ValueError: If the same genre name appears twice.
        """
        for source, default_fiction in (
                (fiction_source, True),
                (nonfiction_source, False)):
            for item in source:
                subgenres = []
                audience_restriction = None
                name = item
                fiction = default_fiction
                if isinstance(item, dict):
                    name = item['name']
                    subgenres = item.get('subgenres', [])
                    audience_restriction = cls._audience_restriction_from(
                        item,
                        ('audience_restriction', 'audiences', 'audience'),
                    )
                    fiction = item.get('fiction', default_fiction)

                cls.add_genre(
                    namespace, genres, name, subgenres, fiction,
                    None, audience_restriction)

    @classmethod
    def add_genre(cls, namespace, genres, name, subgenres, fiction,
                  parent, audience_restriction):
        """Create a GenreData object. Add it to a dictionary and a namespace.

        :param name: A genre name, a dictionary definition, or a tuple
            whose first item is one of those.
        :raises ValueError: If `genres` already has a genre of this name.
        """
        if isinstance(name, tuple):
            # Only the first item is used; the second has never had any
            # effect and is deliberately discarded.
            name, _ = name

        default_fiction = None
        default_audience = None
        if parent:
            default_fiction = parent.is_fiction
            default_audience = parent.audience_restriction

        if isinstance(name, dict):
            data = name
            subgenres = data.get('subgenres', [])
            name = data['name']
            fiction = data.get('fiction', default_fiction)
            audience_restriction = cls._audience_restriction_from(
                data,
                ('audience', 'audiences', 'audience_restriction'),
                default_audience,
            )
        if name in genres:
            raise ValueError("Duplicate genre name! %s" % name)

        # Create the GenreData object.
        genre_data = GenreData(name, fiction, parent, audience_restriction)
        if parent:
            parent.subgenres.append(genre_data)

        # Add the genre to the given dictionary, keyed on name.
        genres[genre_data.name] = genre_data

        # Convert the name to a Python-safe variable name,
        # and add it to the given namespace.
        namespace[genre_data.variable_name] = genre_data

        # Do the same for subgenres.
        for sub in subgenres:
            cls.add_genre(namespace, genres, sub, [], fiction,
                          genre_data, audience_restriction)
# Build the module-wide genre registry. Besides filling `genres` (keyed
# on genre name), this also publishes every genre into this module's
# globals under its Python-safe variable name.
genres = {}
GenreData.populate(
    globals(),
    genres,
    fiction_genres,
    nonfiction_genres,
)
class Lowercased(str):
    """A lowercased string that remembers its original value.

    A single trailing period is dropped from the lowercased form. The
    text as it was supplied (after conversion to str) is kept in the
    `original` attribute.
    """

    def __new__(cls, value):
        if isinstance(value, Lowercased):
            # Nothing to do.
            return value
        if not isinstance(value, str):
            value = str(value)
        new_value = value.lower()
        if new_value.endswith('.'):
            new_value = new_value[:-1]
        o = super(Lowercased, cls).__new__(cls, new_value)
        o.original = value
        return o

    @classmethod
    def scrub_identifier(cls, identifier):
        """Normalize an identifier to a Lowercased string.

        :return: `identifier` unchanged if it is empty or None;
            otherwise its Lowercased form. (Previously a non-empty
            identifier fell off the end of the method and came back
            as None.)
        """
        if not identifier:
            return identifier
        return cls(identifier)
class AgeOrGradeClassifier(Classifier):
    """Classifier for tags that may state either an age in years or a
    school grade level.
    """

    @classmethod
    def audience(cls, identifier, name):
        """Ask AgeClassifier first; fall back on GradeLevelClassifier
        only if it has no opinion.
        """
        by_age = AgeClassifier.audience(identifier, name)
        if by_age != None:
            return by_age
        return GradeLevelClassifier.audience(identifier, name)

    @classmethod
    def target_age(cls, identifier, name):
        """This tag might contain a grade level, an age in years, or nothing.

        We will try both a grade level and an age in years, but we
        will require that the tag indicate what's being measured. A
        tag like "9-12" will not match anything because we don't know if it's
        age 9-12 or grade 9-12.
        """
        unknown = cls.range_tuple(None, None)
        result = AgeClassifier.target_age(identifier, name, True)
        if result != unknown:
            return result
        return GradeLevelClassifier.target_age(identifier, name, True)
class WorkClassifier(object):
    """Boil down a bunch of Classification objects into a few values."""

    # Lookup tables keyed on publisher or imprint name.
    # TODO: This needs a lot of additions.

    # Publisher name -> genre.
    genre_publishers = dict([
        ("Harlequin", Romance),
        ("Pocket Books/Star Trek", Media_Tie_in_SF),
        ("Kensington", Urban_Fiction),
        ("Fodor's Travel Publications", Travel),
        ("Marvel Entertainment, LLC", Comics_Graphic_Novels),
    ])

    # Imprint name -> genre.
    genre_imprints = dict([
        ("Harlequin Intrigue", Romantic_Suspense),
        ("Love Inspired Suspense", Romantic_Suspense),
        ("Harlequin Historical", Historical_Romance),
        ("Harlequin Historical Undone", Historical_Romance),
        ("Frommers", Travel),
        ("LucasBooks", Media_Tie_in_SF),
    ])

    # Imprint name -> audience.
    audience_imprints = dict([
        ("Harlequin Teen", Classifier.AUDIENCE_YOUNG_ADULT),
        ("HarperTeen", Classifier.AUDIENCE_YOUNG_ADULT),
        ("Open Road Media Teen & Tween", Classifier.AUDIENCE_YOUNG_ADULT),
        ("Rosen Young Adult", Classifier.AUDIENCE_YOUNG_ADULT),
    ])

    not_adult_publishers = {
        "Scholastic Inc.",
        "Random House Children's Books",
        "Little, Brown Books for Young Readers",
        "Penguin Young Readers Group",
        "Hachette Children's Books",
        "Nickelodeon Publishing",
    }

    not_adult_imprints = {
        "Scholastic",
        "Scholastic Paperbacks",
        "Random House Books for Young Readers",
        "HMH Books for Young Readers",
        "Knopf Books for Young Readers",
        "Delacorte Books for Young Readers",
        "Open Road Media Young Readers",
        "Macmillan Young Listeners",
        "Bloomsbury Childrens",
        "NYR Children's Collection",
        "Bloomsbury USA Childrens",
        "National Geographic Children's Books",
    }

    # Names associated with a fiction status.
    fiction_imprints = {"Del Rey"}
    nonfiction_imprints = {"Harlequin Nonfiction"}
    nonfiction_publishers = {"Wiley"}
    fiction_publishers = set()
def __init__(self, work, test_session=None, debug=False):
    """Set up empty weight tallies for one Work.

    :param work: The Work being classified. Its database session is
        used unless `test_session` is given.
    :param test_session: A session that overrides the Work's own.
    :param debug: If true, every Classification passed to add() is
        retained so classify() can log it.
    """
    self._db = Session.object_session(work)
    if test_session:
        self._db = test_session
    self.work = work

    # Running tallies, one Counter per question being answered.
    self.fiction_weights = Counter()
    self.audience_weights = Counter()
    self.target_age_lower_weights = Counter()
    self.target_age_upper_weights = Counter()
    self.genre_weights = Counter()

    self.direct_from_license_source = set()
    self.prepared = False
    self.debug = debug
    self.classifications = []
    self.seen_classifications = set()

    # %s rather than %d: identical output for an integer id, but does
    # not raise TypeError when the Work has no id yet (id is None).
    self.log = logging.getLogger("Classifier (workid=%s)" % self.work.id)

    # Each flag flips to True once a staff classification of that kind
    # has been seen by add().
    self.using_staff_genres = False
    self.using_staff_fiction_status = False
    self.using_staff_audience = False
    self.using_staff_target_age = False

    # Keep track of whether we've seen one of Overdrive's generic
    # "Juvenile" classifications, as well as its more specific
    # subsets like "Picture Books" and "Beginning Readers"
    self.overdrive_juvenile_generic = False
    self.overdrive_juvenile_with_target_age = False
def add(self, classification):
    """Prepare a single Classification for consideration.

    The classification's weight is added to the genre, fiction,
    audience and target-age tallies. Once a staff classification of a
    given kind has been seen, the tally of that kind is reset and
    non-staff input of that kind is ignored from then on.

    :param classification: An object exposing `subject`,
        `data_source`, `scaled_weight`, `comes_from_license_source`,
        `generic_juvenile_audience` and
        `weight_as_indicator_of_target_age`.
    """
    try:
        from ..model import DataSource, Subject
    except (ImportError, ValueError):
        # A relative import that cannot be resolved raises ValueError
        # on older Python 3 releases and ImportError on 3.9+; fall
        # back on the top-level module either way.
        from model import DataSource, Subject

    # We only consider a given classification once from a given
    # data source.
    key = (classification.subject, classification.data_source)
    if key in self.seen_classifications:
        return
    self.seen_classifications.add(key)
    if self.debug:
        self.classifications.append(classification)

    # Make sure the Subject is ready to be used in calculations.
    if not classification.subject.checked:  # or self.debug
        classification.subject.assign_to_genre()

    if classification.comes_from_license_source:
        self.direct_from_license_source.add(classification)
    else:
        if classification.subject.describes_format:
            # TODO: This is a bit of a hack.
            #
            # Only accept a classification having to do with
            # format (e.g. 'comic books') if that classification
            # comes direct from the license source. Otherwise it's
            # really easy for a graphic adaptation of a novel to
            # get mixed up with the original novel, whereupon the
            # original book is classified as a graphic novel.
            return

    # Put the weight of the classification behind various
    # considerations.
    weight = classification.scaled_weight
    subject = classification.subject
    from_staff = classification.data_source.name == DataSource.LIBRARY_STAFF

    # if classification is genre or NONE from staff, ignore all non-staff genres
    is_genre = subject.genre is not None
    is_none = (
        from_staff
        and subject.type == Subject.SIMPLIFIED_GENRE
        and subject.identifier == SimplifiedGenreClassifier.NONE
    )
    if is_genre or is_none:
        if not from_staff and self.using_staff_genres:
            return
        if from_staff and not self.using_staff_genres:
            # first encounter with staff genre, so throw out existing genre weights
            self.using_staff_genres = True
            self.genre_weights = Counter()
        if is_genre:
            self.weigh_genre(subject.genre, weight)

    # if staff classification is fiction or nonfiction, ignore all other fictions
    if not self.using_staff_fiction_status:
        if from_staff and subject.type == Subject.SIMPLIFIED_FICTION_STATUS:
            # encountering first staff fiction status,
            # so throw out existing fiction weights
            self.using_staff_fiction_status = True
            self.fiction_weights = Counter()
        self.fiction_weights[subject.fiction] += weight

    # if staff classification is about audience, ignore all other audience classifications
    if not self.using_staff_audience:
        if from_staff and subject.type == Subject.FREEFORM_AUDIENCE:
            self.using_staff_audience = True
            self.audience_weights = Counter()
            self.audience_weights[subject.audience] += weight
        else:
            if classification.generic_juvenile_audience:
                # We have a generic 'juvenile' classification. The
                # audience might say 'Children' or it might say 'Young
                # Adult' but we don't actually know which it is.
                #
                # We're going to split the difference, with a slight
                # preference for YA, to bias against showing
                # age-inappropriate material to children. To
                # counterbalance the fact that we're splitting up the
                # weight this way, we're also going to treat this
                # classification as evidence _against_ an 'adult'
                # classification.
                self.audience_weights[Classifier.AUDIENCE_YOUNG_ADULT] += (weight * 0.6)
                self.audience_weights[Classifier.AUDIENCE_CHILDREN] += (weight * 0.4)
                for audience in Classifier.AUDIENCES_ADULT:
                    if audience != Classifier.AUDIENCE_ALL_AGES:
                        # 'All Ages' is considered an adult audience,
                        # but a generic 'juvenile' classification
                        # is not evidence against it.
                        self.audience_weights[audience] -= weight * 0.5
            else:
                self.audience_weights[subject.audience] += weight

    if not self.using_staff_target_age:
        if from_staff and subject.type == Subject.AGE_RANGE:
            self.using_staff_target_age = True
            self.target_age_lower_weights = Counter()
            self.target_age_upper_weights = Counter()
        if subject.target_age:
            # Figure out how reliable this classification really is as
            # an indicator of a target age.
            scaled_weight = classification.weight_as_indicator_of_target_age
            target_min = subject.target_age.lower
            target_max = subject.target_age.upper
            if target_min is not None:
                # Convert an exclusive bound to the inclusive age.
                if not subject.target_age.lower_inc:
                    target_min += 1
                self.target_age_lower_weights[target_min] += scaled_weight
            if target_max is not None:
                if not subject.target_age.upper_inc:
                    target_max -= 1
                self.target_age_upper_weights[target_max] += scaled_weight

    if not self.using_staff_audience and not self.using_staff_target_age:
        if subject.type=='Overdrive' and subject.audience==Classifier.AUDIENCE_CHILDREN:
            if subject.target_age and (
                    subject.target_age.lower or subject.target_age.upper
            ):
                # This is a juvenile classification like "Picture
                # Books" which implies a target age.
                self.overdrive_juvenile_with_target_age = classification
            else:
                # This is a generic juvenile classification like
                # "Juvenile Fiction".
                self.overdrive_juvenile_generic = classification
def prepare_to_classify(self):
    """Called the first time classify() is called. Does miscellaneous
    one-time prep work that requires all data to be in place.
    """
    self.weigh_metadata()

    explicit = (
        Classifier.AUDIENCE_CHILDREN,
        Classifier.AUDIENCE_YOUNG_ADULT,
        Classifier.AUDIENCE_ADULTS_ONLY,
    )
    license_source_audiences = {
        c.subject.audience for c in self.direct_from_license_source
    }
    source_names_explicit_audience = any(
        a in explicit for a in license_source_audiences
    )
    if (self.direct_from_license_source
            and not self.using_staff_audience
            and not source_names_explicit_audience):
        # If this was erotica, or a book for children or young
        # adults, the distributor would have given some indication
        # of that fact. In the absence of any such indication, we
        # can assume very strongly that this is a regular old book
        # for adults.
        #
        # 3M is terrible at distinguishing between childrens'
        # books and YA books, but books for adults can be
        # distinguished by their _lack_ of childrens/YA
        # classifications.
        self.audience_weights[Classifier.AUDIENCE_ADULT] += 500

    generic = self.overdrive_juvenile_generic
    if generic and not self.overdrive_juvenile_with_target_age:
        # This book is classified under 'Juvenile Fiction' but not
        # under 'Picture Books' or 'Beginning Readers'. The
        # implicit target age here is 9-12 (the portion of
        # Overdrive's 'juvenile' age range not covered by 'Picture
        # Books' or 'Beginning Readers').
        weight = generic.weight_as_indicator_of_target_age
        self.target_age_lower_weights[9] += weight
        self.target_age_upper_weights[12] += weight

    self.prepared = True
def classify(self, default_fiction=None, default_audience=None):
    """Boil the accumulated weights down to final values.

    :param default_fiction: Passed through to fiction().
    :param default_audience: Passed through to audience().
    :return: A 4-tuple (genres, fiction, audience, target_age).
    """
    debugging = self.debug

    # Do a little prep work.
    if not self.prepared:
        self.prepare_to_classify()

    if debugging:
        for c in self.classifications:
            self.log.debug(
                "%d %r (via %s)", c.weight, c.subject, c.data_source.name
            )

    # Actually figure out the classifications
    fiction = self.fiction(default_fiction=default_fiction)
    genres = self.genres(fiction)
    audience = self.audience(genres, default_audience=default_audience)
    target_age = self.target_age(audience)

    if debugging:
        for heading, tally in (
            ("Fiction weights:", self.fiction_weights),
            ("Genre weights:", self.genre_weights),
            ("Audience weights:", self.audience_weights),
        ):
            self.log.debug(heading)
            for k, v in tally.most_common():
                self.log.debug(" %s: %s", v, k)

    return genres, fiction, audience, target_age
def fiction(self, default_fiction=None):
    """Is it more likely this is a fiction or nonfiction book?

    :param default_fiction: Returned when the evidence does not point
        either way.
    """
    weights = self.fiction_weights
    if not weights:
        # We have absolutely no idea one way or the other, and it
        # would be irresponsible to guess.
        return default_fiction

    if weights[True] > weights[False]:
        return True
    if weights[False] > 0:
        return False
    return default_fiction
def audience(self, genres=None, default_audience=None):
    """What's the most likely audience for this book?

    :param genres: The genres already chosen for this book. May be
        omitted.
    :param default_audience: To avoid embarrassing situations we will
        classify works as being intended for adults absent convincing
        evidence to the contrary. In some situations (like the metadata
        wrangler), it's better to state that we have no information, so
        default_audience can be set to None.
    :return: One of the Classifier.AUDIENCE_* constants, or
        `default_audience`.
    """
    # If we determined that Erotica was a significant enough
    # component of the classification to count as a genre, the
    # audience will always be 'Adults Only', even if the audience
    # weights would indicate something else.
    if genres and Erotica in genres:
        return Classifier.AUDIENCE_ADULTS_ONLY

    w = self.audience_weights
    if not w:
        # We have absolutely no idea, and it would be
        # irresponsible to guess.
        return default_audience

    children_weight = w.get(Classifier.AUDIENCE_CHILDREN, 0)
    ya_weight = w.get(Classifier.AUDIENCE_YOUNG_ADULT, 0)
    adult_weight = w.get(Classifier.AUDIENCE_ADULT, 0)
    adults_only_weight = w.get(Classifier.AUDIENCE_ADULTS_ONLY, 0)
    all_ages_weight = w.get(Classifier.AUDIENCE_ALL_AGES, 0)
    research_weight = w.get(Classifier.AUDIENCE_RESEARCH, 0)

    total_adult_weight = adult_weight + adults_only_weight

    audience = default_audience

    # A book will be classified as a young adult or childrens'
    # book when the weight of that audience is more than twice the
    # combined weight of the 'adult' and 'adults only' audiences.
    # If that combined weight is zero, then any amount of evidence
    # is sufficient.
    threshold = total_adult_weight * 2

    # If both the 'children' weight and the 'YA' weight pass the
    # threshold, we go with the one that weighs more.
    # If the 'children' weight passes the threshold on its own
    # we go with 'children'.
    total_juvenile_weight = children_weight + ya_weight
    if (research_weight > (total_adult_weight + all_ages_weight) and
            research_weight > (total_juvenile_weight + all_ages_weight) and
            research_weight > threshold):
        audience = Classifier.AUDIENCE_RESEARCH
    elif (all_ages_weight > total_adult_weight and
            all_ages_weight > total_juvenile_weight):
        audience = Classifier.AUDIENCE_ALL_AGES
    elif children_weight > threshold and children_weight > ya_weight:
        audience = Classifier.AUDIENCE_CHILDREN
    elif ya_weight > threshold:
        audience = Classifier.AUDIENCE_YOUNG_ADULT
    elif total_juvenile_weight > threshold:
        # Neither weight passes the threshold on its own, but
        # combined they do pass the threshold. Go with
        # 'Young Adult' to be safe.
        audience = Classifier.AUDIENCE_YOUNG_ADULT
    elif total_adult_weight > 0:
        audience = Classifier.AUDIENCE_ADULT

    # If the 'adults only' weight is more than 1/4 of the total adult
    # weight, classify as 'adults only' to be safe.
    #
    # TODO: This has not been calibrated.
    if (audience == Classifier.AUDIENCE_ADULT
            and adults_only_weight > total_adult_weight/4):
        audience = Classifier.AUDIENCE_ADULTS_ONLY

    return audience
@classmethod
def top_tier_values(cls, counter):
    """Given a Counter mapping values to their frequency of occurrence,
    return all values that are as common as the most common value.

    :param counter: A mapping of value -> frequency.
    :return: The set of values whose frequency equals the maximum; an
        empty set for an empty counter.
    """
    if not counter:
        return set()
    # Take the maximum explicitly. Tracking it with a truthiness test
    # would treat a top frequency of 0 as "not yet set" and let
    # lower-frequency values into the result.
    top_frequency = max(counter.values())
    return {
        value for value, freq in counter.items()
        if freq == top_frequency
    }
def target_age(self, audience):
    """Derive a target age range from the gathered data.

    :param audience: The audience that has been chosen for the book.
    :return: For any audience other than children or young adult, or
        when no target age evidence was gathered, the result of
        `Classifier.default_target_age_for_audience(audience)`.
        Otherwise the result of `Classifier.range_tuple()` for the
        consensus lower and upper bounds.
    """
    if audience not in (
        Classifier.AUDIENCE_CHILDREN, Classifier.AUDIENCE_YOUNG_ADULT
    ):
        # This is not a children's or YA book. Assertions about
        # target age are irrelevant and the default value rules.
        return Classifier.default_target_age_for_audience(audience)

    lower_weights = self.target_age_lower_weights
    upper_weights = self.target_age_upper_weights

    if self.debug:
        for heading, weights in (
            ("Possible target age minima:", lower_weights),
            ("Possible target age maxima:", upper_weights),
        ):
            if weights:
                self.log.debug(heading)
                for k, v in weights.most_common():
                    self.log.debug(" %s: %s", v, k)

    # Only consider the most reliable classifications: try to reach
    # consensus on the lower and upper bounds of the age range.
    target_age_min = None
    target_age_max = None
    if lower_weights:
        # Find the youngest age in the top tier of values.
        target_age_min = min(self.top_tier_values(lower_weights))
    if upper_weights:
        # Find the oldest age in the top tier of values.
        target_age_max = max(self.top_tier_values(upper_weights))

    # Compare against None explicitly: an age of 0 is a legitimate
    # opinion (e.g. an age range of "0-2") and must not be treated
    # as the absence of one.
    if target_age_min is None and target_age_max is None:
        # We found no opinions about target age. Use the default.
        return Classifier.default_target_age_for_audience(audience)

    # If only one bound is known, use it for both ends of the range.
    if target_age_min is None:
        target_age_min = target_age_max
    if target_age_max is None:
        target_age_max = target_age_min

    # Err on the side of setting the minimum age too high.
    if target_age_min > target_age_max:
        target_age_max = target_age_min
    return Classifier.range_tuple(target_age_min, target_age_max)
def genres(self, fiction, cutoff=0.15):
    """Consolidate the gathered genre weights and drop the weakest.

    :param fiction: The (independently determined) fiction status of
        the book, compared against each genre's `default_fiction`;
        None if no determination has been made.
    :param cutoff: The minimum share of the running total weight a
        genre needs in order to be kept.
    :return: An empty dict if no genre weights were gathered;
        otherwise the filtered result of
        `consolidate_genre_weights()`.
    """
    candidates = dict(self.genre_weights)
    if not candidates:
        # We have absolutely no idea, and it would be
        # irresponsible to guess.
        return {}

    if fiction is not None:
        # A fiction determination lets us eliminate every genre that
        # conflicts with it, no matter how often that genre was
        # asserted. If we know a book is nonfiction it can't be
        # science fiction; it's probably a history of science
        # fiction or something.
        #
        # TODO: If we don't have a fiction determination, the
        # genres we end up with may help us make one.
        conflicting = [
            genre for genre in candidates
            if genre.default_fiction != fiction
        ]
        for genre in conflicting:
            del candidates[genre]

    # Consolidate parent genres into their heaviest subgenre.
    survivors = self.consolidate_genre_weights(candidates)
    remaining_weight = float(sum(survivors.values()))

    # Strip out the stragglers. Every removal also shrinks the total
    # that the genres after it are measured against.
    for genre, score in list(survivors.items()):
        if score / remaining_weight < cutoff:
            remaining_weight -= score
            del survivors[genre]
    return survivors
def weigh_genre(self, genre_data, weight):
    """Add `weight` to the running weight of a genre.

    A helper method that ensures we always use database Genre
    objects, not GenreData objects, when weighting genres.

    :param genre_data: An object whose `name` is used to look up the
        database Genre (presumably a GenreData).
    :param weight: The amount to add to that genre's entry in
        `self.genre_weights`.
    """
    try:
        from ..model import Genre
    except (ImportError, ValueError):
        # A relative import that climbs past the top-level package
        # raised ValueError before Python 3.9 and raises ImportError
        # from 3.9 on; a relative import with no parent package also
        # raises ImportError. In every case fall back to the
        # top-level module.
        from model import Genre
    genre, _ = Genre.lookup(self._db, genre_data.name)
    self.genre_weights[genre] += weight
[docs] @classmethod
def consolidate_genre_weights(
cls, weights, subgenre_swallows_parent_at=0.03
):
"""Fold the weight of parent genres into their heaviest subgenres.

If a genre and its subgenres both show up, examine the subgenre
with the highest weight. If its weight exceeds a certain
proportion of the weight of the parent genre, assign the
parent's weight to the subgenre and remove the parent.

:param weights: A mapping of genres to weights. The mapping itself
is not modified. Any key that is not a GenreData is replaced by
`genres[key.name]` before its weight is counted.
:param subgenre_swallows_parent_at: The proportion of the parent's
weight that the heaviest subgenre's weight must exceed for the
subgenre to swallow the parent.
:return: A new Counter holding the consolidated weights.
"""
#print("Before consolidation:")
#for genre, weight in weights.items():
# print("", genre, weight)
# Convert Genre objects to GenreData. Weights of keys that resolve
# to the same GenreData are added together.
consolidated = Counter()
for genre, weight in list(weights.items()):
if not isinstance(genre, GenreData):
genre = genres[genre.name]
consolidated[genre] += weight
# For each parent genre that is itself present in `consolidated`,
# remember its heaviest present child as a (genre, weight) tuple.
heaviest_child = dict()
for genre, weight in list(consolidated.items()):
for parent in genre.parents:
if parent in consolidated:
if ((not parent in heaviest_child)
or weight > heaviest_child[parent][1]):
heaviest_child[parent] = (genre, weight)
#print("Heaviest child:")
#for parent, (genre, weight) in heaviest_child.items():
# print("", parent, genre, weight)
made_it = False
while not made_it:
# Visit the (parent, heaviest child) pairs from the heaviest child
# weight to the lightest. sorted() builds a new list, so updates
# made to `heaviest_child` below are not seen by the current pass.
for parent, (child, weight) in sorted(
heaviest_child.items(),
key=lambda genre: genre[1][1], reverse=True
):
# A parent that is no longer in `consolidated` counts as weightless.
parent_weight = consolidated.get(parent, 0)
if weight > (subgenre_swallows_parent_at * parent_weight):
# The child swallows the parent: move the parent's weight over.
consolidated[child] += parent_weight
# Counter deletion does not raise for a key that is already gone.
del consolidated[parent]
changed = False
# Note that this inner loop rebinds `parent`. Every tracked
# ancestor of the swallowed parent now records `child`, with its
# new combined weight, as its heaviest child.
for parent in parent.parents:
if parent in heaviest_child:
heaviest_child[parent] = (child, consolidated[child])
changed = True
if changed:
# We changed the dict, so we need to restart
# the iteration.
break
# We made it all the way through the dict without changing it.
# NOTE(review): confirm that this assignment is skipped when the
# loop above exits via `break`; otherwise the restart described
# above never happens.
made_it = True
#print("Final heaviest child:")
#for parent, (genre, weight) in heaviest_child.items():
# print("", parent, genre, weight)
#print("After consolidation:")
#for genre, weight in consolidated.items():
# print("", genre, weight)
return consolidated
# Register classifiers in the Classifier.classifiers dictionary, keyed
# by the classification scheme each one handles.
Classifier.classifiers[Classifier.FREEFORM_AUDIENCE] = FreeformAudienceClassifier
Classifier.classifiers[Classifier.AXIS_360_AUDIENCE] = AgeOrGradeClassifier
# Finally, import classifiers described in submodules.
from .age import (
GradeLevelClassifier,
InterestLevelClassifier,
AgeClassifier,
)
from .bisac import BISACClassifier
from .rbdigital import (
RBDigitalAudienceClassifier,
RBDigitalSubjectClassifier,
)
from .ddc import DeweyDecimalClassifier
from .lcc import LCCClassifier
from .gutenberg import GutenbergBookshelfClassifier
from .bic import BICClassifier
from .simplified import (
SimplifiedFictionClassifier,
SimplifiedGenreClassifier,
)
from .overdrive import OverdriveClassifier
from .keyword import (
KeywordBasedClassifier,
LCSHClassifier,
FASTClassifier,
TAGClassifier,
Eg,
)