# install pandas (basic, if path is not set yet)
py -m pip install pandas
# or set PATH to use pip:
setx PATH "%PATH%;C:\<path\to\python\directory>\Scripts"
pip install pandas
# if "connection error: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed" [!]:
py -m pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org --trusted-host pypi.python.org pip pandas
# if PermissionError: [WinError 5] Access is denied
py -m pip install --user pandas
# or via creating a virtual environment venv:
py -m venv c:\path\to\newenvironment
# then execute:
c:\path\to\newenvironment\Scripts\activate.bat
# To install Pandas: "pip install pandas"
# To import pandas:
import pandas as pd
import numpy as np # extra import which will be used in examples
# Create simple pandas series
>>> s = pd.Series([1, 3, 5, np.nan, 6, 8])
>>> s
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
# Create pandas DataFrame from CSV:
df = pd.read_csv('path_to.csv')
# Create timeseries pandas DataFrame
>>> dates = pd.date_range("20130101", periods=6)
>>> df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
>>> df
                   A         B         C         D
2013-01-01  0.469112 -0.282863 -1.509059 -1.135632
2013-01-02  1.212112 -0.173215  0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929  1.071804
2013-01-04  0.721555 -0.706771 -1.039575  0.271860
2013-01-05 -0.424972  0.567020  0.276232 -1.087401
2013-01-06 -0.673690  0.113648 -1.478427  0.524988
# Create pandas DataFrame from a dictionary
>>> d = {
...     "A": 1.0,
...     "B": pd.Timestamp("20130102"),
...     "C": pd.Series(1, index=list(range(4)), dtype="float32"),
...     "D": np.array([3] * 4, dtype="int32"),
...     "E": pd.Categorical(["test", "train", "test", "train"]),
...     "F": "foo",
... }
>>> df = pd.DataFrame(d)
>>> df
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
# Other important functions:
df.head() # view first 5 rows
df.tail() # view last 5 rows
df.index # get index
df.columns # get columns
df.describe() # describes df
df.T # transpose
df.sort_index() # sort dataframe rows by index
df.sort_values('column_name') # sort dataframe rows by specific column
# select column
df['column_name']
df.column_name # alternative
# selection
df[0:3] # select rows by index number
df["20130102":"20130104"] # select rows by index values
df.loc["20130102"] # select by label
df.loc[slice(None), ["A", "B"]] # select all rows "slice(None)" and only columns "A","B"
df.iloc[3] # select by position
# And many more can be found here: https://pandas.pydata.org/docs/user_guide/10min.html
#for dropping a column in a dataframe
df = df.drop(['PassengerId'], axis = 1)
#for selecting all columns except one
df.iloc[:, df.columns != "Survived"]
# for checking NaN values in a column
df['your column name'].isnull()