# encoding: utf-8
import csv
import os
import re
import string
from . import *
from .keyword import KeywordBasedClassifier
class CustomMatchToken(object):
    """A custom token used in matching rules.

    This is the base class for rule tokens whose comparison against a
    subject token is something other than plain string equality.
    Subclasses must override :meth:`matches`.
    """

    def matches(self, subject_token):
        """Does the given token match this one?

        :param subject_token: A single part of a subject name.
        :raises NotImplementedError: Always; the base class defines no
            matching behavior of its own.
        """
        raise NotImplementedError()
class Something(CustomMatchToken):
    """A CustomMatchToken that will match any single token."""

    def matches(self, subject_token):
        """Accept whatever token is offered.

        :param subject_token: A single part of a subject name (ignored).
        :return: Always True.
        """
        return True
class RE(CustomMatchToken):
    """A CustomMatchToken that performs a regular expression search."""

    def __init__(self, pattern):
        """Compile `pattern` for case-insensitive searching.

        :param pattern: A regular expression, as accepted by `re.compile`.
        """
        self.re = re.compile(pattern, re.I)

    def matches(self, subject_token):
        """Search for the pattern anywhere in the subject token.

        :param subject_token: A single part of a subject name.
        :return: The result of `re.search` -- a match object (truthy) on
            success, None otherwise. Callers should rely on truthiness
            only.
        """
        return self.re.search(subject_token)
class Interchangeable(CustomMatchToken):
    """A token that matches a list of strings."""

    def __init__(self, *choices):
        """All of these strings are interchangeable for matching purposes.

        :param choices: The strings that should be treated as equivalent.
            Each one is passed through `Lowercased` before being stored.
        """
        self.choices = {Lowercased(x) for x in choices}

    def matches(self, subject_token):
        """Is the subject token one of the interchangeable choices?

        :param subject_token: A single part of a subject name; it is
            passed through `Lowercased` before the membership test.
        :return: True if the normalized token is in `self.choices`.
        """
        return Lowercased(subject_token) in self.choices
# Ready-made tokens that matching rules can use in place of plain strings.
something = Something()
fiction = Interchangeable("Juvenile Fiction", "Young Adult Fiction", "Fiction")
juvenile = Interchangeable("Juvenile Fiction", "Juvenile Nonfiction")
ya = Interchangeable("Young Adult Fiction", "Young Adult Nonfiction")

# Plain sentinel objects: MatchingRule handles these itself because
# matching them can modify the token stack.
anything = object()
nonfiction = object()

# BISAC categories that have been renamed. Both the old and the new name
# are accepted as equivalent; for most of them the rename is cosmetic.
body_mind_spirit = Interchangeable("Body, Mind & Spirit", "Mind & Spirit")
psychology = Interchangeable("Psychology", "Psychology & Psychiatry")
technology = Interchangeable("Technology & Engineering", "Technology")
social_topics = Interchangeable("Social Situations", "Social Topics")

# This rename is _not_ cosmetic: the old category was split in two, and
# everything from the old category is put into one of the two.
literary_criticism = Interchangeable(
    "Literary Criticism", "Literary Criticism & Collections"
)

# Tokens that may only appear as the first token of a rule, mapped to the
# name used for them in error messages.
special_variables = {
    nonfiction: "nonfiction",
    fiction: "fiction",
    juvenile: "juvenile",
    ya: "ya",
}
class MatchingRule(object):
    """A rule that takes a list of subject parts and returns
    an appropriate classification.
    """

    def __init__(self, result, *ruleset):
        """Build a rule.

        :param result: The value `match` returns when a subject matches.
            May not be None, since None signals a non-match.
        :param ruleset: The rule tokens, in order. Strings are stored
            through `Lowercased`; any other object is stored as-is.
        :raises ValueError: If `result` is None, or if one of the
            `special_variables` appears anywhere but first.
        """
        if result is None:
            raise ValueError(
                "MatchingRule returns None on a non-match, it can't also return None on a match."
            )

        self.result = result
        self.ruleset = []

        # Track the subjects that were 'caught' by this rule,
        # for debugging purposes.
        self.caught = []

        for i, rule in enumerate(ruleset):
            if i > 0 and rule in special_variables:
                raise ValueError(
                    "Special token '%s' must be the first in a ruleset."
                    % special_variables[rule]
                )

            if isinstance(rule, (bytes, str)):
                # It's a string. We do case-insensitive comparisons,
                # so lowercase it.
                self.ruleset.append(Lowercased(rule))
            else:
                # It's a special object. Add it to the ruleset as-is.
                self.ruleset.append(rule)

    def match(self, *subject):
        """If `subject` matches this ruleset, return the appropriate
        result. Otherwise, return None.

        :param subject: The parts of a subject name, in order.
        :return: `self.result` on a match, None otherwise. A matched
            subject is also appended to `self.caught`.
        """
        # Work on copies: _consume() removes tokens from both lists.
        must_match = list(self.ruleset)
        remaining_subject = list(subject)

        # Consume tokens from both lists until we've confirmed no
        # match or there is nothing left to match.
        match_so_far = True
        while match_so_far and must_match:
            match_so_far, must_match, remaining_subject = self._consume(
                must_match, remaining_subject
            )

        if match_so_far:
            # Everything that had to match, did.
            self.caught.append(subject)
            return self.result

        # Something that had to match, didn't.
        return None

    def _consume(self, rules, subject):
        """The first token (and possibly more) of the rules must match the
        first token (and possibly more) of the subject.

        All matched rule and subject tokens are consumed. Both lists are
        modified in place.

        :return: A 3-tuple (could_match, new_rules, new_subject)

            could_match is falsy if we now know that the subject does
            not match the rule, and truthy if it might still match the
            rule. (It is whatever the token comparison returned, so it
            is not always a bool.)

            new_rules contains the tokens in the ruleset that have yet to
            be activated.

            new_subject contains the tokens in the subject that have yet
            to be checked.
        """
        if not rules:
            # An empty ruleset matches everything.
            return True, rules, subject

        if not subject and rules != [anything]:
            # Apart from [anything], no non-empty ruleset matches an
            # empty subject.
            return False, rules, subject

        # Figure out which rule we'll be applying. We won't need it
        # again, so we can remove it from the ruleset.
        rule_token = rules.pop(0)

        # `anything` and `nonfiction` are sentinel objects, so they are
        # recognized by identity.
        if rule_token is anything:
            # This is the complicated one.
            if not rules:
                # If the final rule is 'anything', then that's redundant,
                # but we can declare success and stop.
                return True, rules, subject

            # At this point we know that 'anything' is followed by some
            # other rule token.
            next_rule = rules.pop(0)

            # We can consume as many subject tokens as necessary, but
            # eventually a subject token must match this subsequent
            # rule token.
            while subject:
                subject_token = subject.pop(0)
                submatch, _, _ = self._consume([next_rule], [subject_token])
                if submatch:
                    # We had to remove some number of subject tokens,
                    # but we found one that matches the next rule.
                    return True, rules, subject
                # That token didn't match, but maybe the next one will.

            # We went through the entire remaining subject and didn't
            # find a match for the rule token that follows 'anything'.
            return False, rules, subject

        # We're comparing two individual tokens.
        subject_token = subject.pop(0)
        if isinstance(rule_token, CustomMatchToken):
            match = rule_token.matches(subject_token)
        elif rule_token is nonfiction:
            # This is too complex to be a CustomMatchToken because
            # we may be modifying the subject token list.
            match = subject_token not in (
                'juvenile fiction', 'young adult fiction', 'fiction'
            )
            if match and subject_token not in (
                'juvenile nonfiction', 'young adult nonfiction'
            ):
                # The implicit top-level lane is 'nonfiction',
                # which means we popped a token like 'History' that
                # needs to go back on the stack.
                subject.insert(0, subject_token)
        else:
            # The strings must match exactly.
            match = rule_token == subject_token
        return match, rules, subject
def m(result, *ruleset):
    """Alias for the MatchingRule constructor with a short name.

    :param result: The value the rule returns on a match.
    :param ruleset: The rule tokens, passed through unchanged.
    :return: A new MatchingRule.
    """
    return MatchingRule(result, *ruleset)
class BISACClassifier(Classifier):
    """Handle real, genuine, according-to-Hoyle BISAC classifications.

    Subclasses of this class can use the same basic classification logic
    to classify classifications that are based on BISAC but have cosmetic
    differences.

    First, a BISAC code is mapped to its human-readable name.

    Second, the name is split into parts (e.g. ["Fiction", "War &
    Military"]).

    To determine fiction status, audience, target age, or genre, the
    list of name parts is compared against each of a list of matching
    rules.
    """

    # Map identifiers to human-readable names. The file is read once,
    # when the class body is executed, and closed immediately afterwards.
    with open(os.path.join(resource_dir, "bisac.csv")) as _names_file:
        NAMES = dict(
            [cell.strip() for cell in row]
            for row in csv.reader(_names_file)
        )
    # Don't leave the (closed) file handle behind as a class attribute.
    del _names_file

    # Indicates that even though this rule doesn't match a subject, no
    # further rules in the same category should be run on it, because they
    # will lead to inaccurate information.
    stop = object()

    # If none of these rules match, a lane's fiction status depends on the
    # genre assigned to it.
    FICTION = [
        m(True, "Fiction"),
        m(True, "Juvenile Fiction"),
        m(False, "Juvenile Nonfiction"),
        m(True, "Young Adult Fiction"),
        m(False, "Young Adult Nonfiction"),
        m(False, anything, "Essays"),
        m(False, anything, "Letters"),
        m(True, "Literary Collections"),
        m(stop, "Humor"),
        m(stop, "Drama"),
        m(stop, "Poetry"),
        m(False, anything),
    ]

    # In BISAC, juvenile fiction and YA fiction are kept in separate
    # spaces. Nearly everything outside that space can be presumed to
    # have AUDIENCE_ADULT.
    AUDIENCE = [
        m(Classifier.AUDIENCE_CHILDREN, "Bibles", anything, "Children"),
        m(Classifier.AUDIENCE_CHILDREN, juvenile, anything),
        m(Classifier.AUDIENCE_YOUNG_ADULT, ya, anything),
        m(Classifier.AUDIENCE_YOUNG_ADULT, "Bibles", anything, "Youth & Teen"),
        m(Classifier.AUDIENCE_ADULTS_ONLY, anything, "Erotica"),
        m(Classifier.AUDIENCE_ADULTS_ONLY, "Humor", "Topic", "Adult"),
        m(Classifier.AUDIENCE_ADULT, anything),
    ]

    TARGET_AGE = [
        m((0, 4), juvenile, anything, "Readers", "Beginner"),
        m((5, 7), juvenile, anything, "Readers", "Intermediate"),
        m((5, 7), juvenile, anything, "Early Readers"),
        m((8, 13), juvenile, anything, "Chapter Books"),
    ]

    GENRE = [
        # Put all erotica in Erotica, to keep the other lanes at
        # "Adult" level or lower.
        m(Erotica, anything, 'Erotica'),

        # Put all non-erotica comics into the same bucket, regardless
        # of their content.
        m(Comics_Graphic_Novels, 'Comics & Graphic Novels'),
        m(Comics_Graphic_Novels, nonfiction, 'Comics & Graphic Novels'),
        m(Comics_Graphic_Novels, fiction, 'Comics & Graphic Novels'),

        # "Literary Criticism / Foo" implies Literary Criticism, not Foo.
        m(Literary_Criticism, anything, literary_criticism),

        # "Fiction / Christian / Foo" implies Religious Fiction
        # more strongly than it implies Foo.
        m(Religious_Fiction, fiction, anything, 'Christian'),

        # "Fiction / Foo / Short Stories" implies Short Stories more
        # strongly than it implies Foo. This assumes that a short
        # story collection within a genre will also be classified
        # separately under that genre. This could definitely be
        # improved but would require a Subject to map to multiple
        # Genres.
        m(Short_Stories, fiction, anything, RE('^Anthologies')),
        m(Short_Stories, fiction, anything, RE('^Short Stories')),
        m(Short_Stories, 'Literary Collections'),
        m(Short_Stories, fiction, anything, 'Collections & Anthologies'),

        # Classify top-level fiction categories into fiction genres.
        #
        # First, handle large overarching genres that have subgenres
        # and adjacent genres.
        #

        # Fantasy
        m(Epic_Fantasy, fiction, 'Fantasy', 'Epic'),
        m(Historical_Fantasy, fiction, 'Fantasy', 'Historical'),
        m(Urban_Fantasy, fiction, 'Fantasy', 'Urban'),
        m(Fantasy, fiction, 'Fantasy'),
        m(Fantasy, fiction, 'Romance', 'Fantasy'),
        m(Fantasy, fiction, 'Sagas'),

        # Mystery
        # n.b. no BISAC for Paranormal_Mystery
        m(Crime_Detective_Stories, fiction, 'Mystery & Detective', 'Private Investigators'),
        m(Crime_Detective_Stories, fiction, 'Crime'),
        m(Crime_Detective_Stories, fiction, 'Thrillers', 'Crime'),
        m(Hard_Boiled_Mystery, fiction, 'Mystery & Detective', 'Hard-Boiled'),
        m(Police_Procedural, fiction, 'Mystery & Detective', 'Police Procedural'),
        m(Cozy_Mystery, fiction, 'Mystery & Detective', 'Cozy'),
        m(Historical_Mystery, fiction, 'Mystery & Detective', 'Historical'),
        m(Women_Detectives, fiction, 'Mystery & Detective', 'Women Sleuths'),
        m(Mystery, fiction, anything, 'Mystery & Detective'),

        # Horror
        m(Ghost_Stories, fiction, 'Ghost'),
        m(Occult_Horror, fiction, 'Occult & Supernatural'),
        m(Gothic_Horror, fiction, 'Gothic'),
        m(Horror, fiction, 'Horror'),

        # Romance
        # n.b. no BISAC for Gothic Romance
        m(Contemporary_Romance, fiction, 'Romance', 'Contemporary'),
        m(Historical_Romance, fiction, 'Romance', 'Historical'),
        m(Paranormal_Romance, fiction, 'Romance', 'Paranormal'),
        m(Western_Romance, fiction, 'Romance', 'Western'),
        m(Romantic_Suspense, fiction, 'Romance', 'Suspense'),
        m(Romantic_SF, fiction, 'Romance', 'Time Travel'),
        m(Romantic_SF, fiction, 'Romance', 'Science Fiction'),
        m(Romance, fiction, 'Romance'),

        # Science fiction
        # n.b. no BISAC for Cyberpunk
        m(Dystopian_SF, fiction, 'Dystopian'),
        m(Space_Opera, fiction, 'Science Fiction', 'Space Opera'),
        m(Military_SF, fiction, 'Science Fiction', 'Military'),
        m(Alternative_History, fiction, 'Alternative History'),
        # Juvenile steampunk is classified directly beneath 'fiction'.
        m(Steampunk, fiction, anything, 'Steampunk'),
        m(Science_Fiction, fiction, 'Science Fiction'),

        # Thrillers
        # n.b. no BISAC for Supernatural_Thriller
        m(Historical_Thriller, fiction, 'Thrillers', 'Historical'),
        m(Espionage, fiction, 'Thrillers', 'Espionage'),
        m(Medical_Thriller, fiction, 'Thrillers', 'Medical'),
        m(Political_Thriller, fiction, 'Thrillers', 'Political'),
        m(Legal_Thriller, fiction, 'Thrillers', 'Legal'),
        m(Technothriller, fiction, 'Thrillers', 'Technological'),
        m(Military_Thriller, fiction, 'Thrillers', 'Military'),
        m(Suspense_Thriller, fiction, 'Thrillers'),

        # Then handle the less complicated genres of fiction.
        m(Adventure, fiction, 'Action & Adventure'),
        m(Adventure, fiction, 'Sea Stories'),
        m(Adventure, fiction, 'War & Military'),
        m(Classics, fiction, 'Classics'),
        m(Folklore, fiction, 'Fairy Tales, Folk Tales, Legends & Mythology'),
        m(Historical_Fiction, anything, 'Historical'),
        m(Humorous_Fiction, fiction, 'Humorous'),
        m(Humorous_Fiction, fiction, 'Satire'),
        m(Literary_Fiction, fiction, 'Literary'),
        m(LGBTQ_Fiction, fiction, 'Gay'),
        m(LGBTQ_Fiction, fiction, 'Lesbian'),
        m(LGBTQ_Fiction, fiction, 'Gay & Lesbian'),
        m(Religious_Fiction, fiction, 'Religious'),
        m(Religious_Fiction, fiction, 'Jewish'),
        m(Religious_Fiction, fiction, 'Visionary & Metaphysical'),
        m(Womens_Fiction, fiction, anything, 'Contemporary Women'),
        m(Westerns, fiction, 'Westerns'),

        # n.b. BISAC "Fiction / Urban" is distinct from "Fiction /
        # African-American / Urban", and does not map to any of our
        # genres.
        m(Urban_Fiction, fiction, 'African American', 'Urban'),

        # BISAC classifies these genres at the top level, which we
        # treat as 'nonfiction', but we classify them as fiction. It
        # doesn't matter because they're neither, really.
        m(Drama, nonfiction, 'Drama'),
        m(Poetry, nonfiction, 'Poetry'),

        # Now on to nonfiction.

        # Classify top-level nonfiction categories into nonfiction genres.
        #
        # First, handle large overarching genres that have subgenres
        # and adjacent genres.
        #

        # Art & Design
        m(Architecture, nonfiction, 'Architecture'),
        m(Art_Criticism_Theory, nonfiction, 'Art', 'Criticism & Theory'),
        m(Art_History, nonfiction, 'Art', 'History'),
        m(Fashion, nonfiction, 'Design', 'Fashion'),
        m(Design, nonfiction, 'Design'),
        m(Art_Design, nonfiction, 'Art'),
        m(Photography, nonfiction, 'Photography'),

        # Personal Finance & Business
        m(Business, nonfiction, 'Business & Economics', RE('^Business.*')),
        m(Business, nonfiction, 'Business & Economics', 'Accounting'),
        m(Economics, nonfiction, 'Business & Economics', 'Economics'),
        m(Economics, nonfiction, 'Business & Economics', 'Environmental Economics'),
        m(Economics, nonfiction, 'Business & Economics', RE('^Econo.*')),
        m(Management_Leadership, nonfiction, 'Business & Economics', 'Management'),
        m(Management_Leadership, nonfiction, 'Business & Economics', 'Management Science'),
        m(Management_Leadership, nonfiction, 'Business & Economics', 'Leadership'),
        m(Personal_Finance_Investing, nonfiction, 'Business & Economics', 'Personal Finance'),
        m(Personal_Finance_Investing, nonfiction, 'Business & Economics', 'Personal Success'),
        m(Personal_Finance_Investing, nonfiction, 'Business & Economics', 'Investments & Securities'),
        m(Real_Estate, nonfiction, 'Business & Economics', 'Real Estate'),
        m(Personal_Finance_Business, nonfiction, 'Business & Economics'),

        # Parenting & Family
        m(Parenting, nonfiction, 'Family & Relationships', 'Parenting'),
        m(Family_Relationships, nonfiction, 'Family & Relationships'),

        # Food & Health
        m(Bartending_Cocktails, nonfiction, 'Cooking', 'Beverages'),
        m(Health_Diet, nonfiction, 'Cooking', 'Health & Healing'),
        m(Health_Diet, nonfiction, 'Health & Fitness'),
        m(Vegetarian_Vegan, nonfiction, 'Cooking', 'Vegetarian & Vegan'),
        m(Cooking, nonfiction, 'Cooking'),

        # History
        m(African_History, nonfiction, 'History', 'Africa'),
        m(Ancient_History, nonfiction, 'History', 'Ancient'),
        m(Asian_History, nonfiction, 'History', 'Asia'),
        m(Civil_War_History, nonfiction, 'History', 'United States', RE('^Civil War')),
        m(European_History, nonfiction, 'History', 'Europe'),
        m(Latin_American_History, nonfiction, 'History', 'Latin America'),
        m(Medieval_History, nonfiction, 'History', 'Medieval'),
        m(Military_History, nonfiction, 'History', 'Military'),
        m(Renaissance_Early_Modern_History, nonfiction, 'History', 'Renaissance'),
        m(Renaissance_Early_Modern_History, nonfiction, 'History', 'Modern', RE('^1[678]th Century')),
        m(Modern_History, nonfiction, 'History', 'Modern'),
        m(United_States_History, nonfiction, 'History', 'Native American'),
        m(United_States_History, nonfiction, 'History', 'United States'),
        m(World_History, nonfiction, 'History', 'World'),
        m(World_History, nonfiction, 'History', 'Civilization'),
        m(History, nonfiction, 'History'),

        # Hobbies & Home
        m(Antiques_Collectibles, nonfiction, 'Antiques & Collectibles'),
        m(Crafts_Hobbies, nonfiction, 'Crafts & Hobbies'),
        m(Gardening, nonfiction, 'Gardening'),
        m(Games, nonfiction, 'Games'),
        m(House_Home, nonfiction, 'House & Home'),
        m(Pets, nonfiction, 'Pets'),

        # Entertainment
        m(Film_TV, nonfiction, 'Performing Arts', 'Film & Video'),
        m(Film_TV, nonfiction, 'Performing Arts', 'Television'),
        m(Music, nonfiction, 'Music'),
        m(Performing_Arts, nonfiction, 'Performing Arts'),

        # Reference & Study Aids
        m(Dictionaries, nonfiction, 'Reference', 'Dictionaries'),
        m(Foreign_Language_Study, nonfiction, 'Foreign Language Study'),
        m(Law, nonfiction, 'Law'),
        m(Study_Aids, nonfiction, 'Study Aids'),
        m(Reference_Study_Aids, nonfiction, 'Reference'),
        m(Reference_Study_Aids, nonfiction, 'Language Arts & Disciplines'),

        # Religion & Spirituality
        m(Body_Mind_Spirit, nonfiction, body_mind_spirit),
        m(Buddhism, nonfiction, 'Religion', 'Buddhism'),
        m(Christianity, nonfiction, 'Religion', RE('^Biblical')),
        m(Christianity, nonfiction, 'Religion', RE('^Christian')),
        m(Christianity, nonfiction, 'Bibles'),
        m(Hinduism, nonfiction, 'Religion', 'Hinduism'),
        m(Islam, nonfiction, 'Religion', 'Islam'),
        m(Judaism, nonfiction, 'Religion', 'Judaism'),
        m(Religion_Spirituality, nonfiction, 'Religion'),

        # Science & Technology
        m(Computers, nonfiction, 'Computers'),
        m(Mathematics, nonfiction, 'Mathematics'),
        m(Medical, nonfiction, 'Medical'),
        m(Nature, nonfiction, 'Nature'),
        m(Psychology, nonfiction, psychology),
        m(Political_Science, nonfiction, 'Social Science', 'Politics & Government'),
        m(Social_Sciences, nonfiction, 'Social Science'),
        m(Technology, nonfiction, technology),
        m(Technology, nonfiction, 'Transportation'),
        m(Science, nonfiction, 'Science'),

        # Then handle the less complicated genres of nonfiction.
        # n.b. no BISAC for Periodicals.
        # n.b. no BISAC for Humorous Nonfiction per se.
        m(Music, nonfiction, 'Biography & Autobiography', 'Composers & Musicians'),
        m(Entertainment, nonfiction, 'Biography & Autobiography', 'Entertainment & Performing Arts'),
        m(Biography_Memoir, nonfiction, 'Biography & Autobiography'),
        m(Education, nonfiction, "Education"),
        m(Philosophy, nonfiction, 'Philosophy'),
        m(Political_Science, nonfiction, 'Political Science'),
        m(Self_Help, nonfiction, 'Self-Help'),
        m(Sports, nonfiction, 'Sports & Recreation'),
        m(Travel, nonfiction, 'Travel'),
        m(True_Crime, nonfiction, 'True Crime'),

        # Handle cases where Juvenile/YA uses different terms than
        # would be used for the same books for adults.
        m(Business, nonfiction, 'Careers'),
        m(Christianity, nonfiction, "Religious", "Christian"),
        m(Cooking, nonfiction, "Cooking & Food"),
        m(Education, nonfiction, "School & Education"),
        m(Family_Relationships, nonfiction, "Family"),
        m(Fantasy, fiction, "Fantasy & Magic"),
        m(Ghost_Stories, fiction, 'Ghost Stories'),
        m(Fantasy, fiction, 'Magical Realism'),
        m(Fantasy, fiction, 'Mermaids'),
        m(Fashion, nonfiction, 'Fashion'),
        m(Folklore, fiction, "Fairy Tales & Folklore"),
        m(Folklore, fiction, "Legends, Myths, Fables"),
        m(Games, nonfiction, "Games & Activities"),
        m(Health_Diet, nonfiction, "Health & Daily Living"),
        m(Horror, fiction, "Horror & Ghost Stories"),
        m(Horror, fiction, "Monsters"),
        m(Horror, fiction, "Paranormal"),
        m(Horror, fiction, 'Paranormal, Occult & Supernatural'),
        m(Horror, fiction, 'Vampires'),
        m(Horror, fiction, 'Werewolves & Shifters'),
        m(Horror, fiction, 'Zombies'),
        m(Humorous_Fiction, fiction, "Humorous Stories"),
        m(Humorous_Nonfiction, "Young Adult Nonfiction", "Humor"),
        m(LGBTQ_Fiction, fiction, 'LGBT'),
        m(Law, nonfiction, "Law & Crime"),
        m(Mystery, fiction, "Mysteries & Detective Stories"),
        m(Nature, nonfiction, "Animals"),
        m(Personal_Finance_Investing, nonfiction, 'Personal Finance'),
        m(Poetry, fiction, "Nursery Rhymes"),
        m(Poetry, fiction, "Stories in Verse"),
        m(Poetry, fiction, 'Novels in Verse'),
        m(Poetry, fiction, 'Poetry'),
        m(Reference_Study_Aids, nonfiction, "Language Arts"),
        m(Romance, fiction, "Love & Romance"),
        m(Science_Fiction, fiction, "Robots"),
        m(Science_Fiction, fiction, "Time Travel"),
        m(Social_Sciences, nonfiction, "Media Studies"),
        m(Suspense_Thriller, fiction, 'Superheroes'),
        m(Suspense_Thriller, fiction, 'Thrillers & Suspense'),

        # Most of the subcategories of 'Science & Nature' go into Nature,
        # but these go into Science.
        m(Science, nonfiction, 'Science & Nature', 'Discoveries'),
        m(Science, nonfiction, 'Science & Nature', 'Experiments & Projects'),
        m(Science, nonfiction, 'Science & Nature', 'History of Science'),
        m(Science, nonfiction, 'Science & Nature', 'Physics'),
        m(Science, nonfiction, 'Science & Nature', 'Weights & Measures'),
        m(Science, nonfiction, 'Science & Nature', 'General'),

        # Any other subcategory of 'Science & Nature' goes under Nature
        m(Nature, nonfiction, 'Science & Nature', something),

        # Life Strategies is juvenile/YA-specific, and contains both
        # fiction and nonfiction. It's called "Social Issues" for
        # juvenile fiction/nonfiction, and "Social Topics" for YA
        # nonfiction. "Social Themes" in YA fiction is _not_
        # classified as Life Strategies.
        m(Life_Strategies, fiction, "social issues"),
        m(Life_Strategies, nonfiction, "social issues"),
        m(Life_Strategies, nonfiction, social_topics),
    ]

    @classmethod
    def _first_match(cls, rules, name, fallback):
        """Run `name` through `rules` and return the first verdict.

        :param rules: A list of MatchingRule objects, tried in order.
        :param name: The list of name parts to classify.
        :param fallback: A one-argument callable, called with the name
            parts joined by "/" when no rule produces a verdict.
        :return: None if a rule yields `cls.stop`; otherwise the first
            non-None rule result; otherwise whatever `fallback` returns.
        """
        for rule in rules:
            verdict = rule.match(*name)
            if verdict is cls.stop:
                # A 'stop' rule fired: don't consult any further rules.
                return None
            if verdict is not None:
                return verdict
        # If all else fails, try the keyword-based classifier.
        return fallback("/".join(name))

    @classmethod
    def is_fiction(cls, identifier, name):
        """Determine fiction status from the FICTION rules.

        :param identifier: The scrubbed identifier, passed through to the
            keyword-based fallback.
        :param name: The list of name parts to classify.
        :return: The first rule verdict, None if a 'stop' rule fired, or
            the keyword-based classifier's answer if no rule matched.
        """
        return cls._first_match(
            cls.FICTION, name,
            lambda keyword: KeywordBasedClassifier.is_fiction(
                identifier, keyword
            )
        )

    @classmethod
    def audience(cls, identifier, name):
        """Determine audience from the AUDIENCE rules.

        :param identifier: The scrubbed identifier, passed through to the
            keyword-based fallback.
        :param name: The list of name parts to classify.
        :return: The first rule verdict, None if a 'stop' rule fired, or
            the keyword-based classifier's answer if no rule matched.
        """
        return cls._first_match(
            cls.AUDIENCE, name,
            lambda keyword: KeywordBasedClassifier.audience(
                identifier, keyword
            )
        )

    @classmethod
    def target_age(cls, identifier, name):
        """Determine target age from the TARGET_AGE rules.

        :param identifier: The scrubbed identifier, passed through to the
            keyword-based fallback.
        :param name: The list of name parts to classify.
        :return: The first rule verdict, None if a 'stop' rule fired, or
            the keyword-based classifier's answer if no rule matched.
        """
        return cls._first_match(
            cls.TARGET_AGE, name,
            lambda keyword: KeywordBasedClassifier.target_age(
                identifier, keyword
            )
        )

    @classmethod
    def genre(cls, identifier, name, fiction, audience):
        """Determine genre from the GENRE rules.

        :param identifier: The scrubbed identifier, passed through to the
            keyword-based fallback.
        :param name: The list of name parts to classify.
        :param fiction: Passed through to the keyword-based fallback.
        :param audience: Passed through to the keyword-based fallback.
        :return: The first rule verdict, None if a 'stop' rule fired, or
            the keyword-based classifier's answer if no rule matched.
        """
        return cls._first_match(
            cls.GENRE, name,
            lambda keyword: KeywordBasedClassifier.genre(
                identifier, keyword, fiction, audience
            )
        )

    # A BISAC name copied from the BISAC website may end with this
    # human-readable note, which is not part of the official name.
    see_also = re.compile(r'\(see also .*')

    @classmethod
    def scrub_identifier(cls, identifier):
        """Normalize a BISAC identifier.

        A leading 'FB' is removed.

        :param identifier: The identifier as provided.
        :return: `identifier` unchanged if it is empty; a 2-tuple
            (identifier, canonical name) if the identifier is a key of
            NAMES; otherwise just the identifier.
        """
        if not identifier:
            return identifier
        if identifier.startswith('FB'):
            identifier = identifier[2:]
        if identifier in cls.NAMES:
            # We know the canonical name for this BISAC identifier,
            # and we are better equipped to classify the canonical
            # names, so use the canonical name in preference to
            # whatever name the distributor provided.
            return (identifier, cls.NAMES[identifier])
        return identifier

    @classmethod
    def scrub_name(cls, name):
        """Split the name into a list of lowercase keywords.

        :param name: A BISAC name whose parts are separated by '/' or '|'.
        :return: A list of the non-empty, whitespace-stripped parts. If
            the name contains neither separator, a one-element list.
        """
        # All of our comparisons are case-insensitive.
        name = Lowercased(name)

        # Take corrective action to fix a number of common problems
        # seen in the wild.
        #
        # A comma may have been replaced with a space, leaving a doubled
        # space behind. Only doubled spaces are touched: replacing every
        # single space would tear apart ordinary multi-word categories
        # such as "action & adventure".
        name = name.replace("  ", ", ")

        # The name may be enclosed in an extra set of quotes.
        for quote in "'\"":
            if name.startswith(quote):
                name = name[1:]
            if name.endswith(quote):
                name = name[:-1]

        # The name may end with an extraneous marker character or
        # (if it was copied from the BISAC website) an asterisk.
        for separator in '|/*':
            if name.endswith(separator):
                name = name[:-1]

        # A name copied from the BISAC website may end with a
        # human-readable cross-reference.
        name = cls.see_also.sub('', name)

        # The canonical separator character is a slash, but a pipe
        # has also been used. The first separator found wins.
        for separator in '|/':
            if separator in name:
                parts = [
                    part.strip() for part in name.split(separator)
                    if part.strip()
                ]
                break
        else:
            parts = [name]
        return parts
# Register BISACClassifier in Classifier.classifiers under the
# Classifier.BISAC key.
Classifier.classifiers[Classifier.BISAC] = BISACClassifier