# Source code for core.util.summary

import logging
from textblob import TextBlob
from textblob.exceptions import MissingCorpusError
from collections import Counter

from . import (
    Bigrams,
    english_bigrams,
)
import re

class SummaryEvaluator(object):
    """Evaluate summaries of a book to find a usable summary.

    A usable summary will have good coverage of the popular noun
    phrases found across all summaries of the book, will have an
    approximate length of four sentences (this is customizable), and
    will not mention words that indicate it's a summary of a specific
    edition of the book.

    All else being equal, a shorter summary is better. A summary is
    penalized for apparently not being in English.
    """

    # These phrases are indicative of a description we can't use for
    # whatever reason.
    default_bad_phrases = set([
        "version of",
        "retelling of",
        "abridged",
        "retelling",
        "condensed",
        "adaptation of",
        "look for",
        "new edition",
        "excerpts",
        "version",
        "edition",
        "selections",
        "complete texts",
        "in one volume",
        "contains",
        "--container",
        "--original container",
        "playaway",
        "complete novels",
        "all rights reserved",
    ])

    # Compiled patterns that likewise disqualify a summary.
    bad_res = set([
        re.compile("the [^ ]+ Collection"),
        re.compile("Includes"),
        re.compile("This is"),
    ])

    # Flipped to False (per instance) the first time noun-phrase
    # extraction fails because the NLTK corpora are missing; scoring
    # then degenerates to a constant.
    _nltk_installed = True

    log = logging.getLogger("Summary Evaluator")

    def __init__(self, optimal_number_of_sentences=4,
                 noun_phrases_to_consider=10, bad_phrases=None):
        """
        :param optimal_number_of_sentences: Summaries of about this
            many sentences score best.
        :param noun_phrases_to_consider: How many of the most common
            noun phrases to use when measuring topical coverage.
        :param bad_phrases: Substrings that disqualify a summary;
            defaults to `default_bad_phrases`.
        """
        self.optimal_number_of_sentences = optimal_number_of_sentences
        self.summaries = []
        self.noun_phrases = Counter()
        self.blobs = dict()
        self.scores = dict()
        # Stored as a float so coverage division below yields a float
        # under both Python 2 and 3 semantics.
        self.noun_phrases_to_consider = float(noun_phrases_to_consider)
        self.top_noun_phrases = None
        if bad_phrases is None:
            self.bad_phrases = self.default_bad_phrases
        else:
            self.bad_phrases = bad_phrases

    def add(self, summary, parser=None):
        """Add a summary to the corpus under evaluation.

        :param summary: The summary text, as str or UTF-8 bytes.
        :param parser: A TextBlob-compatible parser class; defaults to
            TextBlob itself.
        """
        parser_class = parser or TextBlob
        if isinstance(summary, bytes):
            summary = summary.decode("utf8")
        if summary in self.blobs:
            # We already evaluated this summary. Don't count it more
            # than once.
            return
        blob = parser_class(summary)
        self.blobs[summary] = blob
        self.summaries.append(summary)
        if self._nltk_installed:
            try:
                for phrase in blob.noun_phrases:
                    self.noun_phrases[phrase] += 1
            except MissingCorpusError as e:
                self._nltk_installed = False
                self.log.error(
                    "Summary cannot be evaluated: NLTK not installed %r", e)

    def ready(self):
        """We are done adding to the corpus and ready to start evaluating."""
        self.top_noun_phrases = set(
            k for k, v in self.noun_phrases.most_common(
                int(self.noun_phrases_to_consider)))

    def best_choice(self):
        """Return the single best (summary, score) pair.

        :return: A (summary, score) 2-tuple, or (None, None) when no
            summaries have been added.
        """
        choices = self.best_choices(1)
        if choices:
            return choices[0]
        return None, None

    def best_choices(self, n=3):
        """Choose the best `n` choices among the current summaries.

        :return: A list of (summary, score) tuples, best first.
        """
        scores = Counter()
        for summary in self.summaries:
            scores[summary] = self.score(summary)
        return scores.most_common(n)

    def score(self, summary, apply_language_penalty=True):
        """Score a summary relative to our current view of the dataset.

        `ready()` must have been called first so `top_noun_phrases` is
        populated, and `summary` must have been registered with `add()`.

        :param summary: The summary text, as str or UTF-8 bytes.
        :param apply_language_penalty: If True, penalize summaries whose
            bigram distribution looks unlike English.
        :return: A numeric score; higher is better.
        """
        if not self._nltk_installed:
            # Without NLTK, there's no need to evaluate the score.
            return 1
        if isinstance(summary, bytes):
            summary = summary.decode("utf8")
        if summary in self.scores:
            return self.scores[summary]
        blob = self.blobs[summary]

        # Base score: proportion of the popular noun phrases this
        # summary covers.
        top_noun_phrases_used = len(
            [p for p in self.top_noun_phrases if p in blob.noun_phrases])
        score = 1 * (top_noun_phrases_used / self.noun_phrases_to_consider)

        try:
            sentences = len(blob.sentences)
        except Exception:
            # Can't parse into sentences for whatever reason.
            # Make a really bad guess.
            sentences = summary.count(". ") + 1
        off_from_optimal = abs(sentences - self.optimal_number_of_sentences)
        if off_from_optimal == 1:
            # Off by a single sentence is penalized, but less than off
            # by two or more.
            off_from_optimal = 1.5
        if off_from_optimal:
            # This summary is too long or too short.
            score /= (off_from_optimal ** 1.5)

        # Halve the score for each disqualifying phrase or pattern, and
        # for each " -- " separator beyond the third (a sign of a
        # table-of-contents-style description).
        bad_phrases = 0
        l = summary.lower()
        for i in self.bad_phrases:
            if i in l:
                bad_phrases += 1
        for i in self.bad_res:
            if i.search(summary):
                bad_phrases += 1
        if l.count(" -- ") > 3:
            bad_phrases += (l.count(" -- ") - 3)
        score *= (0.5 ** bad_phrases)

        if apply_language_penalty:
            language_difference = english_bigrams.difference_from(
                Bigrams.from_string(summary))
            if language_difference > 1:
                score *= (0.5 ** (language_difference - 1))

        # BUG FIX: the computed score was previously never stored, so
        # the `self.scores` cache created in __init__ was dead and every
        # call (including each best_choices() pass) re-scored the
        # summary from scratch.
        self.scores[summary] = score
        return score