PYTHON

Summarize text with LED (Longformer Encoder-Decoder) using Hugging Face Transformers
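The class below splits a long document into chunks that fit the model's context window, summarizes each chunk with beam search, and trims trailing sentence fragments from the output. It references two project-specific names that are not defined in the snippet: the BaseTextSummarizer base class and the sentence_segmentation helper. Here is a minimal stand-in for each, assuming NLTK's punkt sentence tokenizer is an acceptable substitute for the project's segmenter:

from nltk.tokenize import sent_tokenize  # pip install nltk; nltk.download('punkt')


class BaseTextSummarizer:
    # Minimal stand-in for the snippet's (unshown) abstract base class.
    def summarize(self, text, *args, **kwargs):
        raise NotImplementedError


def sentence_segmentation(text, minimum_n_words_to_accept_sentence=1, language='english'):
    # Hypothetical replacement: split the text into sentences with NLTK and
    # drop sentences shorter than the given word threshold.
    sentences = sent_tokenize(text, language=language)
    return [s for s in sentences if len(s.split()) >= minimum_n_words_to_accept_sentence]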

import os
from distutils.util import strtobool  # note: distutils was removed in Python 3.12

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


class TransformersTextSummarizer(BaseTextSummarizer):
    def __init__(self, model_key, language):
        self._tokenizer = AutoTokenizer.from_pretrained(model_key)
        self._language = language
        # Default to CPU when USE_GPU is unset; strtobool(None) would raise.
        self._device = 'cuda' if bool(strtobool(os.getenv('USE_GPU', 'false'))) else 'cpu'
        # Move the model to the target device once, rather than on every call.
        self._model = AutoModelForSeq2SeqLM.from_pretrained(model_key).to(self._device)

    def __chunk_text(self, text):
        # Greedily pack whole sentences into chunks that fit within the
        # model's maximum input length, measured in tokens.
        sentences = [s + ' ' for s in sentence_segmentation(text, minimum_n_words_to_accept_sentence=1, language=self._language)]

        chunks = []
        chunk = ''
        length = 0

        for sentence in sentences:
            tokenized_sentence = self._tokenizer.encode(sentence, truncation=False, max_length=None, return_tensors='pt')[0]

            # Drop any single sentence that would not fit on its own.
            if len(tokenized_sentence) > self._tokenizer.model_max_length:
                continue

            length += len(tokenized_sentence)

            if length <= self._tokenizer.model_max_length:
                chunk += sentence
            else:
                # The current chunk is full; start a new one with this sentence.
                chunks.append(chunk.strip())
                chunk = sentence
                length = len(tokenized_sentence)

        if len(chunk) > 0:
            chunks.append(chunk.strip())

        return chunks

    def __clean_text(self, text):
        # Trim any unfinished sentence fragment after the last full stop.
        if text.count('.') == 0:
            return text.strip()

        end_index = text.rindex('.') + 1
        return text[:end_index].strip()

    def summarize(self, text, *args, **kwargs):
        chunk_texts = self.__chunk_text(text)
        chunk_summaries = []

        for chunk_text in chunk_texts:
            input_tokenized = self._tokenizer.encode(chunk_text, return_tensors='pt').to(self._device)

            # min/max output lengths are derived from the chunk's character
            # count, which the snippet uses as a rough proxy for token count.
            summary_ids = self._model.generate(
                input_tokenized,
                length_penalty=3.0,
                min_length=int(0.2 * len(chunk_text)),
                max_length=int(0.3 * len(chunk_text)),
                early_stopping=True,
                num_beams=5,
                no_repeat_ngram_size=2,
            )

            output = [self._tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in summary_ids]
            chunk_summaries.append(output)

        # Flatten the per-chunk outputs and trim trailing fragments.
        return [self.__clean_text(summary) for chunk_summary in chunk_summaries for summary in chunk_summary]
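A usage sketch follows; the model key, the USE_GPU flag value, and the input file are illustrative assumptions. allenai/led-base-16384 is one LED checkpoint on the Hugging Face Hub, matching the snippet's title, though any seq2seq checkpoint should load the same way:

import os

os.environ['USE_GPU'] = 'false'  # the class reads this flag at construction time

summarizer = TransformersTextSummarizer('allenai/led-base-16384', 'english')

long_text = open('article.txt').read()  # hypothetical input document

for summary in summarizer.summarize(long_text):
    print(summary)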