>>> tokens = word_tokenize(raw)
>>> type(tokens)
<class 'list'>
>>> len(tokens)
254354
>>> tokens[:10]
['The', 'Project', 'Gutenberg', 'EBook', 'of', 'Crime', 'and', 'Punishment', ',', 'by']
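The session above assumes that raw already holds the text of the book and that word_tokenize has been imported. For reference, a minimal standalone sketch that reproduces it might look like the following; the Gutenberg URL is illustrative (editions vary, so the exact token count may differ), and word_tokenize needs the NLTK 'punkt' tokenizer models to be downloaded first.

from urllib import request
import nltk
from nltk import word_tokenize

nltk.download('punkt')  # tokenizer models required by word_tokenize

# Illustrative URL for Crime and Punishment on Project Gutenberg
url = "https://www.gutenberg.org/files/2554/2554-0.txt"
raw = request.urlopen(url).read().decode('utf8')

tokens = word_tokenize(raw)   # split the raw string into a list of tokens
print(type(tokens))           # <class 'list'>
print(len(tokens))            # on the order of 250,000 tokens for this text
print(tokens[:10])            # the first few tokens, including punctuation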