Search
 
SCRIPT & CODE EXAMPLE
 
CODE EXAMPLE FOR PYTHON

Filter pandas DataFrame by substring criteria

# Keep the rows of `df` whose column 'A' contains the substring "hello".
df[df['A'].str.contains("hello")]
# The pattern is a regex by default, so "|" means OR: keep the rows that
# contain "Hello" or "Britain".
df[df["A"].str.contains("Hello|Britain")]
# `== True` maps every non-True entry (e.g. the NaN produced for a missing
# value) to False, so the mask is purely boolean.
df[df['A'].str.contains("Hello|Britain")==True]
# Regex-based search: keep the rows of `df1` in which "foo" is followed by
# something. The negative lookahead (?!$) rejects a "foo" that sits at the
# end of the string.
df1[df1['col'].str.contains(r'foo(?!$)')]
# When a regex is not required, pass regex=False to disable it and match the
# pattern as a literal substring: select all rows containing "foo".
df1[df1['col'].str.contains('foo', regex=False)]
# Performance-wise, regex search is slower than plain substring search.
# If the column holds NaN values, na=False makes them count as "no match".
s.str.contains('foo|bar', na=False)
# How do I apply this to multiple columns at once?
# `axis=1` makes `apply` call the lambda once per row (each row arrives as a
# Series, despite the parameter name `col`). `str.contains` works element by
# element, so every cell is tested against the pattern.
df.apply(lambda col: col.str.contains('foo|bar', na=False), axis=1)
# Multiple substring search: "|" in the regex means OR.
df4[df4['col'].str.contains(r'foo|baz')]
# Or build the same pattern from a list of terms.
terms = ['foo', 'baz']
df4[df4['col'].str.contains('|'.join(terms))]
# It is wise to escape the terms in case they contain characters that would
# be interpreted as regex metacharacters, i.e. any of the following:
#     . ^ $ * + ? { } [ ] \ | ( )
import re
df4[df4['col'].str.contains('|'.join(map(re.escape, terms)))]
# re.escape escapes the special characters so they are treated literally.
# Matching entire word(s)
df3 = pd.DataFrame({'col': ['the sky is blue', 'bluejay by the window']})
df3
# A plain substring search also matches the "blue" inside "bluejay", so both
# rows are returned.
substring_matches = df3[df3['col'].str.contains('blue')]
substring_matches
# versus
# \b is a regex word boundary, so only the standalone word "blue" matches:
# just the first row is returned.
whole_word_matches = df3[df3['col'].str.contains(r'\bblue\b')]
whole_word_matches
# Plain substring test with a list comprehension and the `in` operator.
df1[['foo' in x for x in df1['col']]]
# Regex search: instead of calling str.contains with the pattern ...
regex_pattern = r'foo(?!$)'
df1[df1['col'].str.contains(regex_pattern)]
# ... precompile the pattern and call its search() in a list comprehension.
# NOTE: flags=re.IGNORECASE makes this variant case-insensitive, unlike the
# str.contains call above.
p = re.compile(regex_pattern, flags=re.IGNORECASE)
df1[[bool(p.search(x)) for x in df1['col']]]
# If "col" has NaNs, then instead of
df1[df1['col'].str.contains(regex_pattern, na=False)]
# use a helper that catches the TypeError raised for non-string values:
def try_search(p, x):
    """Report whether compiled pattern `p` matches anywhere in `x`.

    Values that `p.search` rejects with a TypeError (for example NaN floats
    or None) count as "no match" instead of raising.
    """
    try:
        found = p.search(x)
    except TypeError:
        return False
    else:
        return bool(found)

# Compile the pattern once, then build the row mask with try_search so that
# values which cannot be searched are treated as non-matches.
p = re.compile(regex_pattern)
df1[[try_search(p, x) for x in df1['col']]]
# NumPy: cast the column values to str and keep the rows where np.char.find
# locates 'foo' (find returns -1 when the substring is absent).
df4[np.char.find(df4['col'].values.astype(str), 'foo') > -1]
# np.vectorize: wrap a plain-Python `in` test so it can be called on a whole
# column at once.
f = np.vectorize(lambda haystack, needle: needle in haystack)
f(df1['col'], 'foo')
# Example output: array([ True,  True, False, False])
df1[f(df1['col'], 'foo')]
# Or, for a regex search:
regex_pattern = r'foo(?!$)'
p = re.compile(regex_pattern)
# pd.notna(x) is checked first, so p.search is never called on a missing value.
f = np.vectorize(lambda x: pd.notna(x) and bool(p.search(x)))
df1[f(df1['col'])]
# DataFrame.query: the same filter written as a query expression.
# NOTE(review): engine='python' is presumably required because the expression
# calls a `.str` method - confirm against the pandas query documentation.
df1.query('col.str.contains("foo")', engine='python')
'''
Recommended Usage Precedence
(First) str.contains, for its simplicity and ease of handling NaNs and mixed data
List comprehensions, for their performance (especially if your data is purely strings)
np.vectorize
(Last) df.query
'''



 












Source by stackoverflow.com #
 
PREVIOUS NEXT
Tagged: #Filter #pandas #DataFrame #substring #criteria
ADD COMMENT
Topic
Name
7+4 =