=
!=
>=
str.contains()
&
|
.isin()
.isnull()
.notnull()
.duplicated()
.drop_duplicates()
.between
######
Pandas common 3: filtering like WHERE clause in sql¶
Dataframe¶
In [ ]:
number,name,species,dob
1.0,joe,human,10-02-1977
2.0,john,human,01-04-1954
3.0,mike,human,04-29-1966
4.0,didi,cat,07-12-2019
5.0,aaron,human,09-08-1990
6.0,boo,dog,02-03-2015
7.0,ziggy,dog,08-09-2010
8.0,balou,gorilla,12-10-2005
9.0,"",""
In [248]:
import pandas as pd
df = pd.read_csv("test5.csv")
df
Out[248]:
In [249]:
df["name"] == "joe"
Out[249]:
In [250]:
#do a filter first then parse the boolean into the dataframe
df[df["species"] == "dog"]
Out[250]:
In [251]:
#assign to a parameter then selectively querying them
animal = df["species"] == "dog"
df[animal]
df [["species","name"]]
Out[251]:
In [252]:
animal = df["species"] != "dog"
df[animal]
Out[252]:
note: for date it works like string df["date"] >= "2020-01-01"¶
In [253]:
greater_date = df["dob"] >= "09-09-2005"
df[greater_date]
Out[253]:
In [254]:
greater_number = df["number"] >= 4.0
df[greater_number]
Out[254]:
use "str.contains()" as "LIKE" operator¶
note: other noteworthy parameter for str.contains .. case=False and na=False¶
In [255]:
#like any
df["species"].str.contains("or")
Out[255]:
In [256]:
#start with d
df["species"].str.startswith("d")
Out[256]:
In [257]:
#end with n
df["species"].str.endswith("n")
Out[257]:
"&" as "AND" operator¶
In [258]:
greater_number = df["number"] >= 4.0
species_type = df["species"] == "dog"
df[greater_number & species_type]
Out[258]:
" | " as "OR" operator in SQL¶
In [259]:
greater_number = df["number"] >= 4.0
species_type = df["species"] == "dog"
df[greater_number | species_type]
Out[259]:
In [260]:
greater_number = df["number"] == 4.0
species_type = df["species"] == "dog"
#comparing if greater is true and equal to true on specfies type. It only take row that is both row equal to true
df[greater_number == species_type]
Out[260]:
In [261]:
greater_number = df["number"] >= 4.0
species_type = df["species"] == "dog"
df[species_type]
Out[261]:
In [262]:
df["number"] == 4.0
Out[262]:
In [263]:
df["species"] == "dog"
Out[263]:
".isin()" as "in" operator¶
In [264]:
isin = df["species"].isin(["dog","gorilla"])
df[isin]
Out[264]:
"isnull()" as "IS NULL" operator¶
In [265]:
isnull = df["species"].isnull()
df[isnull]
Out[265]:
"notnull" as "IS NOT NULL" operator¶
In [266]:
notnull = df["species"].notnull()
df[notnull]
Out[266]:
"duplicated()"" somewhat similar to "select column,count() from table having count() > 1 group by column"¶
In [267]:
#keep = false is to print out every duplicates. Other parameters such as keep='first' or keep='last'
duplicate = df["species"].duplicated(keep = False)
df[duplicate]
Out[267]:
"drop_duplicates()" will drop the duplicates and print out what's left¶
In [268]:
dropduplicate = df["species"].drop_duplicates(keep=False)
dropduplicate
Out[268]:
"between" as "BETWEEN" operator¶
In [269]:
between = df["number"].between(2,5)
df[between]
Out[269]:
"loc[a:b]"" as select a range of index like bewtween for variables¶
In [270]:
betweenInd = df.loc['1':'3']
betweenInd
Out[270]:
".query()" is simplify form of where clause¶
In [271]:
df.query('species == "human"')
Out[271]:
In [273]:
df.info()
In [285]:
df["number"].astype(int)
df.describe
df.describe()
df["name"].notnull()
df[df["name"].notnull()]
Out[285]:
In [ ]: