# Source code for core.util.summary

import logging
from textblob import TextBlob
from textblob.exceptions import MissingCorpusError
from collections import Counter

from . import (
    Bigrams,
    english_bigrams,
)
import re

class SummaryEvaluator(object):
    """Evaluate summaries of a book to find a usable summary.

    A usable summary will have good coverage of the popular noun
    phrases found across all summaries of the book, will have an
    approximate length of four sentences (this is customizable), and
    will not mention words that indicate it's a summary of a specific
    edition of the book.

    All else being equal, a shorter summary is better. A summary is
    penalized for apparently not being in English.
    """

    # These phrases are indicative of a description we can't use for
    # whatever reason.
    default_bad_phrases = set([
        "version of",
        "retelling of",
        "abridged",
        "retelling",
        "condensed",
        "adaptation of",
        "look for",
        "new edition",
        "excerpts",
        "version",
        "edition",
        "selections",
        "complete texts",
        "in one volume",
        "contains",
        "--container",
        "--original container",
        "playaway",
        "complete novels",
        "all rights reserved",
    ])

    # Compiled patterns that likewise disqualify a summary.
    bad_res = set([
        re.compile("the [^ ]+ Collection"),
        re.compile("Includes"),
        re.compile("This is"),
    ])

    # Flipped to False (per instance) the first time noun-phrase
    # extraction fails because the NLTK corpora are missing; scoring
    # then degenerates to a constant.
    _nltk_installed = True

    log = logging.getLogger("Summary Evaluator")

    def __init__(self, optimal_number_of_sentences=4,
                 noun_phrases_to_consider=10, bad_phrases=None):
        """
        :param optimal_number_of_sentences: Summaries of about this
            many sentences score best.
        :param noun_phrases_to_consider: How many of the most common
            noun phrases to use when measuring topical coverage.
        :param bad_phrases: Substrings that disqualify a summary;
            defaults to `default_bad_phrases`.
        """
        self.optimal_number_of_sentences = optimal_number_of_sentences
        self.summaries = []
        self.noun_phrases = Counter()
        self.blobs = dict()
        self.scores = dict()
        # Stored as a float so coverage division below yields a float
        # under both Python 2 and 3 semantics.
        self.noun_phrases_to_consider = float(noun_phrases_to_consider)
        self.top_noun_phrases = None
        if bad_phrases is None:
            self.bad_phrases = self.default_bad_phrases
        else:
            self.bad_phrases = bad_phrases

    def add(self, summary, parser=None):
        """Add a summary to the corpus under evaluation.

        :param summary: The summary text, as str or UTF-8 bytes.
        :param parser: A TextBlob-compatible parser class; defaults to
            TextBlob itself.
        """
        parser_class = parser or TextBlob
        if isinstance(summary, bytes):
            summary = summary.decode("utf8")
        if summary in self.blobs:
            # We already evaluated this summary. Don't count it more
            # than once.
            return
        blob = parser_class(summary)
        self.blobs[summary] = blob
        self.summaries.append(summary)
        if self._nltk_installed:
            try:
                for phrase in blob.noun_phrases:
                    self.noun_phrases[phrase] += 1
            except MissingCorpusError as e:
                self._nltk_installed = False
                self.log.error(
                    "Summary cannot be evaluated: NLTK not installed %r", e)

    def ready(self):
        """We are done adding to the corpus and ready to start evaluating."""
        self.top_noun_phrases = set(
            k for k, v in self.noun_phrases.most_common(
                int(self.noun_phrases_to_consider)))

    def best_choice(self):
        """Return the single best (summary, score) pair.

        :return: A (summary, score) 2-tuple, or (None, None) when no
            summaries have been added.
        """
        choices = self.best_choices(1)
        if choices:
            return choices[0]
        return None, None

    def best_choices(self, n=3):
        """Choose the best `n` choices among the current summaries.

        :return: A list of (summary, score) tuples, best first.
        """
        scores = Counter()
        for summary in self.summaries:
            scores[summary] = self.score(summary)
        return scores.most_common(n)

    def score(self, summary, apply_language_penalty=True):
        """Score a summary relative to our current view of the dataset.

        `ready()` must have been called first so `top_noun_phrases` is
        populated, and `summary` must have been registered with `add()`.

        :param summary: The summary text, as str or UTF-8 bytes.
        :param apply_language_penalty: If True, penalize summaries whose
            bigram distribution looks unlike English.
        :return: A numeric score; higher is better.
        """
        if not self._nltk_installed:
            # Without NLTK, there's no need to evaluate the score.
            return 1
        if isinstance(summary, bytes):
            summary = summary.decode("utf8")
        if summary in self.scores:
            return self.scores[summary]
        blob = self.blobs[summary]

        # Base score: proportion of the popular noun phrases this
        # summary covers.
        top_noun_phrases_used = len(
            [p for p in self.top_noun_phrases if p in blob.noun_phrases])
        score = 1 * (top_noun_phrases_used / self.noun_phrases_to_consider)

        try:
            sentences = len(blob.sentences)
        except Exception:
            # Can't parse into sentences for whatever reason.
            # Make a really bad guess.
            sentences = summary.count(". ") + 1
        off_from_optimal = abs(sentences - self.optimal_number_of_sentences)
        if off_from_optimal == 1:
            # Off by a single sentence is penalized, but less than off
            # by two or more.
            off_from_optimal = 1.5
        if off_from_optimal:
            # This summary is too long or too short.
            score /= (off_from_optimal ** 1.5)

        # Halve the score for each disqualifying phrase or pattern, and
        # for each " -- " separator beyond the third (a sign of a
        # table-of-contents-style description).
        bad_phrases = 0
        l = summary.lower()
        for i in self.bad_phrases:
            if i in l:
                bad_phrases += 1
        for i in self.bad_res:
            if i.search(summary):
                bad_phrases += 1
        if l.count(" -- ") > 3:
            bad_phrases += (l.count(" -- ") - 3)
        score *= (0.5 ** bad_phrases)

        if apply_language_penalty:
            language_difference = english_bigrams.difference_from(
                Bigrams.from_string(summary))
            if language_difference > 1:
                score *= (0.5 ** (language_difference - 1))

        # BUG FIX: the computed score was previously never stored, so
        # the `self.scores` cache created in __init__ was dead and every
        # call (including each best_choices() pass) re-scored the
        # summary from scratch.
        self.scores[summary] = score
        return score