PYTHON

Summarize text with LED (Longformer Encoder-Decoder) using Hugging Face Transformers
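The class below splits a long document into chunks that fit the model's context window, summarizes each chunk with beam search, and trims trailing sentence fragments from the output. It references two project-specific names that are not defined in the snippet: the BaseTextSummarizer base class and the sentence_segmentation helper. Here is a minimal stand-in for each, assuming NLTK's punkt sentence tokenizer is an acceptable substitute for the project's segmenter:

from nltk.tokenize import sent_tokenize  # pip install nltk; nltk.download('punkt')


class BaseTextSummarizer:
    # Minimal stand-in for the snippet's (unshown) abstract base class.
    def summarize(self, text, *args, **kwargs):
        raise NotImplementedError


def sentence_segmentation(text, minimum_n_words_to_accept_sentence=1, language='english'):
    # Hypothetical replacement: split the text into sentences with NLTK and
    # drop sentences shorter than the given word threshold.
    sentences = sent_tokenize(text, language=language)
    return [s for s in sentences if len(s.split()) >= minimum_n_words_to_accept_sentence]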

import os
from distutils.util import strtobool  # note: distutils was removed in Python 3.12

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


class TransformersTextSummarizer(BaseTextSummarizer):
    def __init__(self, model_key, language):
        self._tokenizer = AutoTokenizer.from_pretrained(model_key)
        self._language = language
        # Default to CPU when USE_GPU is unset; strtobool(None) would raise.
        self._device = 'cuda' if bool(strtobool(os.getenv('USE_GPU', 'false'))) else 'cpu'
        # Move the model to the target device once, rather than on every call.
        self._model = AutoModelForSeq2SeqLM.from_pretrained(model_key).to(self._device)

    def __chunk_text(self, text):
        # Greedily pack whole sentences into chunks that fit within the
        # model's maximum input length, measured in tokens.
        sentences = [s + ' ' for s in sentence_segmentation(text, minimum_n_words_to_accept_sentence=1, language=self._language)]

        chunks = []
        chunk = ''
        length = 0

        for sentence in sentences:
            tokenized_sentence = self._tokenizer.encode(sentence, truncation=False, max_length=None, return_tensors='pt')[0]

            # Drop any single sentence that would not fit on its own.
            if len(tokenized_sentence) > self._tokenizer.model_max_length:
                continue

            length += len(tokenized_sentence)

            if length <= self._tokenizer.model_max_length:
                chunk += sentence
            else:
                # The current chunk is full; start a new one with this sentence.
                chunks.append(chunk.strip())
                chunk = sentence
                length = len(tokenized_sentence)

        if len(chunk) > 0:
            chunks.append(chunk.strip())

        return chunks

    def __clean_text(self, text):
        # Trim any unfinished sentence fragment after the last full stop.
        if text.count('.') == 0:
            return text.strip()

        end_index = text.rindex('.') + 1
        return text[:end_index].strip()

    def summarize(self, text, *args, **kwargs):
        chunk_texts = self.__chunk_text(text)
        chunk_summaries = []

        for chunk_text in chunk_texts:
            input_tokenized = self._tokenizer.encode(chunk_text, return_tensors='pt').to(self._device)

            # min/max output lengths are derived from the chunk's character
            # count, which the snippet uses as a rough proxy for token count.
            summary_ids = self._model.generate(
                input_tokenized,
                length_penalty=3.0,
                min_length=int(0.2 * len(chunk_text)),
                max_length=int(0.3 * len(chunk_text)),
                early_stopping=True,
                num_beams=5,
                no_repeat_ngram_size=2,
            )

            output = [self._tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in summary_ids]
            chunk_summaries.append(output)

        # Flatten the per-chunk outputs and trim trailing fragments.
        return [self.__clean_text(summary) for chunk_summary in chunk_summaries for summary in chunk_summary]
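A usage sketch follows; the model key, the USE_GPU flag value, and the input file are illustrative assumptions. allenai/led-base-16384 is one LED checkpoint on the Hugging Face Hub, matching the snippet's title, though any seq2seq checkpoint should load the same way:

import os

os.environ['USE_GPU'] = 'false'  # the class reads this flag at construction time

summarizer = TransformersTextSummarizer('allenai/led-base-16384', 'english')

long_text = open('article.txt').read()  # hypothetical input document

for summary in summarizer.summarize(long_text):
    print(summary)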