Coding for Economists

loading data and checking datatypes

check the system path

import sys
sys.path[0]

'/Users/a182501/quarto/program-chunk/posts'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(10)

df = pd.read_csv("data.csv")

df.head(10)

	positionId	positionName	companyId	companySize	industryField	financeStage	companyLabelList	firstType	secondType	thirdType	...	plus	gradeDescription	promotionScoreExplain	aggregatePositionIds	famousCompany
0	6802721	数据分析	475770	50-150人	移动互联网,电商	A轮	['绩效奖金', '带薪年假', '定期体检', '弹性工作']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	NaN	NaN	[]	False
1	5204912	数据建模	50735	150-500人	电商	B轮	['年终奖金', '做五休二', '六险一金', '子女福利']	开发\|测试\|运维类	数据开发	建模	...	NaN	NaN	NaN	[]	False
2	6877668	数据分析	100125	2000人以上	移动互联网,企业服务	上市公司	['节日礼物', '年底双薪', '股票期权', '带薪年假']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	NaN	NaN	[]	False
3	6496141	数据分析	26564	500-2000人	电商	D轮及以上	['生日趴', '每月腐败基金', '每月补贴', '年度旅游']	开发\|测试\|运维类	数据开发	数据分析	...	NaN	NaN	NaN	[]	True
4	6467417	数据分析	29211	2000人以上	物流丨运输	上市公司	['技能培训', '免费班车', '专项奖金', '岗位晋升']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	NaN	NaN	[]	True
5	6882347	数据分析	94826	50-150人	移动互联网,社交	B轮	['股票期权', '扁平管理', '五险一金', '岗位晋升']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	NaN	NaN	[]	False
6	6841659	数据分析	348784	50-150人	移动互联网,电商	A轮	['大牛团队', '扁平管理', '年底双薪', '股票期权']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	NaN	NaN	[]	False
7	6764018	数据建模工程师	13163	500-2000人	移动互联网	上市公司	['绩效奖金', '股票期权', '年底双薪', '专项奖金']	开发\|测试\|运维类	数据开发	建模	...	NaN	NaN	NaN	[]	True
8	6458372	数据分析专家	34132	150-500人	数据服务,广告营销	A轮	['开放式办公', '扁平管理', '带薪假期', '弹性工作时间']	产品\|需求\|项目类	数据分析	其他数据分析	...	NaN	NaN	NaN	[]	False
9	6786904	数据分析师	13163	500-2000人	移动互联网	上市公司	['绩效奖金', '股票期权', '年底双薪', '专项奖金']	开发\|测试\|运维类	数据开发	BI工程师	...	NaN	NaN	NaN	[]	True

filter rows and columns with conditions using `df.loc[condition(s) or row(s),column(s)]`

.loc stands for location and allows you to filter a dataframe. .loc works like an index, so it always comes with square brackets

df.loc[(df["industryField"] == "电商") & (df["financeStage"] == "B轮"), ["companySize", "positionName"]]

	companySize	positionName
1	150-500人	数据建模
17	150-500人	大数据建模总监

Sort rows or columns with `.sort_values()`

Use sort_values(columns, ascending=False) for descending order.

df.sort_values(["companyId"])

	positionId	positionName	companyId	companySize	industryField	financeStage	companyLabelList	firstType	secondType	thirdType	...	plus	pcShow	appShow	deliver	gradeDescription	promotionScoreExplain	isHotHire	count	aggregatePositionIds	famousCompany
86	6496980	数据分析师-Lark	62	2000人以上	文娱丨内容	C轮	['扁平管理', '弹性工作', '大厨定制三餐', '就近租房补贴']	产品\|需求\|项目类	数据分析	其他数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	True
87	6095231	数据分析师-企业SaaS应用	62	2000人以上	文娱丨内容	C轮	['扁平管理', '弹性工作', '大厨定制三餐', '就近租房补贴']	产品\|需求\|项目类	数据分析	其他数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	True
22	5927901	数据分析经理	62	2000人以上	文娱丨内容	C轮	['扁平管理', '弹性工作', '大厨定制三餐', '就近租房补贴']	产品\|需求\|项目类	产品经理	其他产品经理	...	NaN	0	0	0	NaN	NaN	0	0	[]	True
67	5203054	资深数据分析师	329	2000人以上	电商	上市公司	['节日礼物', '技能培训', '免费班车', '带薪年假']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	True
83	6872841	资深数据分析师	329	2000人以上	电商	上市公司	['节日礼物', '技能培训', '免费班车', '带薪年假']	开发\|测试\|运维类	数据开发	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	True
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
90	6456921	数据分析专家	738016	50-150人	电商,数据服务	未融资	[]	开发\|测试\|运维类	数据开发	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
102	6803432	奔驰·耀出行-BI数据分析专家	751158	150-500人	移动互联网	不需要融资	[]	开发\|测试\|运维类	数据开发	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
91	6888169	奔驰耀出行-战略数据分析师	751158	150-500人	移动互联网	不需要融资	[]	产品\|需求\|项目类	数据分析	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
98	6655562	数据分析建模工程师	117422215	50-150人	数据服务,信息安全	A轮	['午餐补助', '带薪年假', '16到18薪', '法定节假日']	开发\|测试\|运维类	人工智能	机器学习	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
99	6677939	数据分析建模工程师（校招）	117422215	50-150人	数据服务,信息安全	A轮	['午餐补助', '带薪年假', '16到18薪', '法定节假日']	开发\|测试\|运维类	人工智能	算法工程师	...	NaN	0	0	0	NaN	NaN	0	0	[]	False

`sort_values` can also sort a text column; strings are compared character by character.

df.sort_values(["industryField"])

	positionId	positionName	companyId	companySize	industryField	financeStage	companyLabelList	firstType	secondType	thirdType	...	plus	pcShow	appShow	deliver	gradeDescription	promotionScoreExplain	isHotHire	count	aggregatePositionIds	famousCompany
80	6310387	业务与数据分析师	93448	150-500人	人工智能,数据服务	B轮	['技能培训', '股票期权', '带薪年假', '绩效奖金']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
44	6653757	银行数据分析岗	23403	2000人以上	企业服务	上市公司	['五险一金', '通讯津贴', '带薪年假', '定期体检']	开发\|测试\|运维类	数据开发	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
93	6785139	数据分析师	665061	50-150人	企业服务	B轮	[]	开发\|测试\|运维类	数据开发	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
16	6486988	资深数据分析师（杭州）	7461	2000人以上	企业服务	上市公司	['工程师氛围', '弹性工作', '扁平管理', '上班不打卡']	开发\|测试\|运维类	数据开发	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	True
15	6882983	产品运营（偏数据分析）	7461	2000人以上	企业服务	上市公司	['工程师氛围', '弹性工作', '扁平管理', '上班不打卡']	运营\|编辑\|客服类	运营	数据运营	...	NaN	0	0	0	NaN	NaN	0	0	[]	True
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
36	6886661	浙江数据分析师	321001	150-500人	金融,数据服务	D轮及以上	['年底双薪', '午餐补助', '年终分红', '绩效奖金']	开发\|测试\|运维类	数据开发	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
74	6837340	数据分析-2020届春招	205347	2000人以上	金融,电商	上市公司	['带薪年假', '定期体检', '免费班车', '领导好']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	True
68	6829736	数据分析负责人 or 数据分析师	205347	2000人以上	金融,电商	上市公司	['带薪年假', '定期体检', '免费班车', '领导好']	开发\|测试\|运维类	数据开发	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	True
26	6850849	数据分析专家	255742	150-500人	金融,电商	C轮	['持牌金融机构', '跨境支付', '跨境金融', '国际化团队']	开发\|测试\|运维类	数据开发	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
82	6820395	产品经理/数据分析（核心业务）-2020届春招	205347	2000人以上	金融,电商	上市公司	['带薪年假', '定期体检', '免费班车', '领导好']	产品\|需求\|项目类	产品经理	产品经理	...	NaN	0	0	0	NaN	NaN	0	0	[]	True

Choose multiple rows or columns using slices

df.loc[2::10, "positionId":"financeStage"]

	positionId	positionName	companyId	companySize	industryField	financeStage
2	6877668	数据分析	100125	2000人以上	移动互联网,企业服务	上市公司
12	6763962	数据分析工程师	13163	500-2000人	移动互联网	上市公司
22	5927901	数据分析经理	62	2000人以上	文娱丨内容	C轮
32	6804489	资深数据分析师	34132	150-500人	数据服务,广告营销	A轮
42	6344146	资深数据分析师	522865	150-500人	游戏	不需要融资
52	6486069	解决方案顾问/数据分析师	166666	150-500人	企业服务,数据服务	B轮
62	6191993	数据分析专家03-10-217	18655	2000人以上	汽车丨出行	D轮及以上
72	6794326	BI数据分析师	374014	500-2000人	移动互联网,金融	B轮
82	6820395	产品经理/数据分析（核心业务）-2020届春招	205347	2000人以上	金融,电商	上市公司
92	6813626	资深数据分析专员	165939	150-500人	数据服务	不需要融资
102	6803432	奔驰·耀出行-BI数据分析专家	751158	150-500人	移动互联网	不需要融资

`2::10` means start at row 2 and take every 10th row from there to the end.

df.iloc[:5, -5:]

	promotionScoreExplain	aggregatePositionIds	famousCompany
0	NaN	[]	False
1	NaN	[]	False
2	NaN	[]	False
3	NaN	[]	True
4	NaN	[]	True

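An omitted bound in a slice takes its default: a missing start before `:` means 0 (the first position) and a missing stop after `:` means the end. So `df.iloc[:5, -5:]` is identical to `df.iloc[0:5, -5:]`.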

df.iloc[0:5,-5:]

	promotionScoreExplain	aggregatePositionIds	famousCompany
0	NaN	[]	False
1	NaN	[]	False
2	NaN	[]	False
3	NaN	[]	True
4	NaN	[]	True

Randomly selecting a sample using `.sample`

df.sample(5)

	positionId	positionName	companyId	companySize	industryField	financeStage	companyLabelList	firstType	secondType	thirdType	...	plus	gradeDescription	promotionScoreExplain	aggregatePositionIds	famousCompany
85	6792091	高级数据分析师	23252	500-2000人	移动互联网	上市公司	['包午餐晚餐', '奖金多多多', '零食下午茶', '全员出国游']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	NaN	NaN	[]	False
35	6284662	bi数据分析师	50354	2000人以上	物流丨运输	D轮及以上	['技能培训', '节日礼物', '年底双薪', '带薪年假']	开发\|测试\|运维类	数据开发	BI工程师	...	NaN	NaN	NaN	[]	False
48	6067812	数据分析专员	98316	2000人以上	电商,消费生活	上市公司	['午餐补助', '带薪年假', '定期体检', '年度旅游']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	NaN	NaN	[]	False
47	6339988	数据分析师/BI	238706	150-500人	电商,移动互联网	不需要融资	['年底双薪', '绩效奖金', '带薪年假', '定期体检']	开发\|测试\|运维类	数据开发	数据分析	...	NaN	NaN	NaN	[]	False
82	6820395	产品经理/数据分析（核心业务）-2020届春招	205347	2000人以上	金融,电商	上市公司	['带薪年假', '定期体检', '免费班车', '领导好']	产品\|需求\|项目类	产品经理	产品经理	...	NaN	NaN	NaN	[]	True

Rename with `.rename`

df.rename(columns={"companySize": "corporationSize"})

	positionId	positionName	companyId	corporationSize	industryField	financeStage	companyLabelList	firstType	secondType	thirdType	...	plus	pcShow	appShow	deliver	gradeDescription	promotionScoreExplain	isHotHire	count	aggregatePositionIds	famousCompany
0	6802721	数据分析	475770	50-150人	移动互联网,电商	A轮	['绩效奖金', '带薪年假', '定期体检', '弹性工作']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
1	5204912	数据建模	50735	150-500人	电商	B轮	['年终奖金', '做五休二', '六险一金', '子女福利']	开发\|测试\|运维类	数据开发	建模	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
2	6877668	数据分析	100125	2000人以上	移动互联网,企业服务	上市公司	['节日礼物', '年底双薪', '股票期权', '带薪年假']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
3	6496141	数据分析	26564	500-2000人	电商	D轮及以上	['生日趴', '每月腐败基金', '每月补贴', '年度旅游']	开发\|测试\|运维类	数据开发	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	True
4	6467417	数据分析	29211	2000人以上	物流丨运输	上市公司	['技能培训', '免费班车', '专项奖金', '岗位晋升']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	True
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
100	6884346	数据分析师	21236	500-2000人	移动互联网,医疗丨健康	C轮	['技能培训', '年底双薪', '节日礼物', '绩效奖金']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
101	6849100	商业数据分析	72076	500-2000人	移动互联网,电商	C轮	['节日礼物', '股票期权', '带薪年假', '年度旅游']	市场\|商务类	市场\|营销	商业数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
102	6803432	奔驰·耀出行-BI数据分析专家	751158	150-500人	移动互联网	不需要融资	[]	开发\|测试\|运维类	数据开发	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	False
103	6704835	BI数据分析师	52840	2000人以上	电商	上市公司	['技能培训', '年底双薪', '节日礼物', '绩效奖金']	开发\|测试\|运维类	数据开发	数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	True
104	6728058	数据分析专家-LQ(J181203029)	2474	2000人以上	汽车丨出行	不需要融资	['弹性工作', '节日礼物', '岗位晋升', '技能培训']	产品\|需求\|项目类	数据分析	其他数据分析	...	NaN	0	0	0	NaN	NaN	0	0	[]	True

Add a new column by direct assignment (the `.assign` equivalent is `df = df.assign(companyIds=df["companyId"] * 10)`)

df['companyIds']=df['companyId']*10

df.head()

	positionId	positionName	companyId	companySize	industryField	financeStage	companyLabelList	firstType	secondType	thirdType	...	gradeDescription	promotionScoreExplain	aggregatePositionIds	famousCompany	companyIds
0	6802721	数据分析	475770	50-150人	移动互联网,电商	A轮	['绩效奖金', '带薪年假', '定期体检', '弹性工作']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	NaN	[]	False	4757700
1	5204912	数据建模	50735	150-500人	电商	B轮	['年终奖金', '做五休二', '六险一金', '子女福利']	开发\|测试\|运维类	数据开发	建模	...	NaN	NaN	[]	False	507350
2	6877668	数据分析	100125	2000人以上	移动互联网,企业服务	上市公司	['节日礼物', '年底双薪', '股票期权', '带薪年假']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	NaN	[]	False	1001250
3	6496141	数据分析	26564	500-2000人	电商	D轮及以上	['生日趴', '每月腐败基金', '每月补贴', '年度旅游']	开发\|测试\|运维类	数据开发	数据分析	...	NaN	NaN	[]	True	265640
4	6467417	数据分析	29211	2000人以上	物流丨运输	上市公司	['技能培训', '免费班车', '专项奖金', '岗位晋升']	产品\|需求\|项目类	数据分析	数据分析	...	NaN	NaN	[]	True	292110

Summarize numerical values with `.describe`

df.describe()

	positionId	companyId	salary	publisherId	approve	latitude	longitude	resumeProcessRate	resumeProcessDay	score	...	adWord	plus	pcShow	appShow	deliver	gradeDescription	promotionScoreExplain	isHotHire	count	companyIds
count	1.050000e+02	1.050000e+02	105.000000	1.050000e+02	105.000000	105.000000	105.000000	105.000000	105.000000	105.000000	...	105.0	0.0	105.0	105.0	105.0	0.0	0.0	105.0	105.0	1.050000e+02
mean	6.491543e+06	2.397016e+06	31723.809524	6.810574e+06	0.990476	30.280568	120.142645	38.876190	0.790476	12.714286	...	0.0	NaN	0.0	0.0	0.0	NaN	NaN	0.0	0.0	2.397016e+07
std	4.290726e+05	1.610641e+07	9858.367432	4.262766e+06	0.097590	0.143060	0.200043	41.411095	0.742854	30.953972	...	0.0	NaN	0.0	0.0	0.0	NaN	NaN	0.0	0.0	1.610641e+08
min	5.203054e+06	6.200000e+01	3500.000000	8.179300e+04	0.000000	30.129580	119.990918	0.000000	0.000000	0.000000	...	0.0	NaN	0.0	0.0	0.0	NaN	NaN	0.0	0.0	6.200000e+02
25%	6.234992e+06	2.118700e+04	27500.000000	4.097927e+06	1.000000	30.213041	120.032234	0.000000	0.000000	3.000000	...	0.0	NaN	0.0	0.0	0.0	NaN	NaN	0.0	0.0	2.118700e+05
50%	6.680900e+06	9.831600e+04	30000.000000	6.714094e+06	1.000000	30.280177	120.113116	23.000000	1.000000	5.000000	...	0.0	NaN	0.0	0.0	0.0	NaN	NaN	0.0	0.0	9.831600e+05
75%	6.804489e+06	2.316980e+05	37500.000000	9.293747e+06	1.000000	30.290310	120.201179	85.000000	1.000000	6.000000	...	0.0	NaN	0.0	0.0	0.0	NaN	NaN	0.0	0.0	2.316980e+06
max	6.896403e+06	1.174222e+08	60000.000000	1.650818e+07	1.000000	31.240897	121.417492	100.000000	4.000000	233.000000	...	0.0	NaN	0.0	0.0	0.0	NaN	NaN	0.0	0.0	1.174222e+09

By default, `.describe()` summarizes only the numerical columns.

working with data

import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Fix the seed so that random draws made through `prng` are reproducible
seed_for_prng = 78557
prng = np.random.default_rng(seed_for_prng)  # prng = pseudo-random number generator

# NOTE(review): this call raised URLError (connection refused) in the recorded
# run below, so it appears to need network access — confirm, or load a local copy.
penguins = sns.load_dataset("penguins")
penguins.head()

URLError: <urlopen error [Errno 61] Connection refused>

type(penguins)

penguins.info()

Everything in Python is an object, and our dataframe is no exception. Each dataframe is made up of a set of series that become its columns, but you can turn a single series into a dataframe too.
Intuitively, a dataframe is many series combined column by column.

s1 = pd.Series([1.0, 6.0, 19.0, 2.0])
s1

0     1.0
1     6.0
2    19.0
3     2.0
dtype: float64

# State name -> population; the dict keys become the index of the Series.
population_dict = {
    "California": 38_332_521,
    "Texas": 26_448_193,
    "New York": 19_651_127,
    "Florida": 19_552_860,
    "Illinois": 12_882_135,
}
s2 = pd.Series(data=population_dict)
s2

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

s2.values

array([38332521, 26448193, 19651127, 19552860, 12882135])

# Each dict entry becomes one column. The scalar entries ("A" and "E") are
# repeated on every row; the other entries each supply four values.
df = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "C": [3, 3, 3, 3],
        "D": pd.Categorical(["test", "train"] * 2),
        "E": "foo",
    }
)
df

	A	B	C	D	E
0	1.0	1.0	3	test	foo
1	1.0	1.0	3	train	foo
2	1.0	1.0	3	test	foo
3	1.0	1.0	3	train	foo

Another way to create a dataframe is to pass a 2-D array of data together with explicit `index` (row) and `columns` labels:

# 6x6 frame holding 0..35 as floats, with rows labelled a-f and columns
# labelled col0-col5.
df = pd.DataFrame(
    np.arange(36).reshape(6, 6),
    index=list("abcdef"),
    columns=[f"col{i}" for i in range(6)],
    dtype=float,
)
df

	col0	col1	col2	col3	col4	col5
a	0.0	1.0	2.0	3.0	4.0	5.0
b	6.0	7.0	8.0	9.0	10.0	11.0
c	12.0	13.0	14.0	15.0	16.0	17.0
d	18.0	19.0	20.0	21.0	22.0	23.0
e	24.0	25.0	26.0	27.0	28.0	29.0
f	30.0	31.0	32.0	33.0	34.0	35.0

import pandas as pd
# Three all-zero integer columns plus two string columns, indexed row0-row3.
df = pd.DataFrame(
    {
        "col0": [0] * 4,
        "col1": [0] * 4,
        "col2": [0] * 4,
        "col3": list("abba"),
        "col4": ["alpha"] + ["gamma"] * 3,
    },
    index=[f"row{i}" for i in range(4)],
)
df.head()
# Adding a scalar to a column subset works element-wise; the result is not
# assigned, so df keeps its zeros.
df[["col0", "col1", "col2"]] + 1

	col0	col1	col2
row0	1	1	1
row1	1	1	1
row2	1	1	1
row3	1	1	1

from skimpy import skim

skim(df)

╭──────────────────────────────────────────────── skimpy summary ─────────────────────────────────────────────────╮
│          Data Summary                Data Types                                                                 │
│ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓                                                          │
│ ┃ dataframe         ┃ Values ┃ ┃ Column Type ┃ Count ┃                                                          │
│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩                                                          │
│ │ Number of rows    │ 4      │ │ int64       │ 3     │                                                          │
│ │ Number of columns │ 5      │ │ string      │ 2     │                                                          │
│ └───────────────────┴────────┘ └─────────────┴───────┘                                                          │
│                                                     number                                                      │
│ ┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓  │
│ ┃ column_name          ┃ NA    ┃ NA %     ┃ mean     ┃ sd    ┃ p0    ┃ p25    ┃ p75    ┃ p100    ┃ hist      ┃  │
│ ┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩  │
│ │ col0                 │     0 │        0 │        0 │     0 │     0 │      0 │      0 │       0 │      █    │  │
│ │ col1                 │     0 │        0 │        0 │     0 │     0 │      0 │      0 │       0 │      █    │  │
│ │ col2                 │     0 │        0 │        0 │     0 │     0 │      0 │      0 │       0 │      █    │  │
│ └──────────────────────┴───────┴──────────┴──────────┴───────┴───────┴────────┴────────┴─────────┴───────────┘  │
│                                                     string                                                      │
│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓  │
│ ┃ column_name               ┃ NA      ┃ NA %       ┃ words per row                ┃ total words              ┃  │
│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩  │
│ │ col3                      │       0 │          0 │                            1 │                        4 │  │
│ │ col4                      │       0 │          0 │                            1 │                        4 │  │
│ └───────────────────────────┴─────────┴────────────┴──────────────────────────────┴──────────────────────────┘  │
╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯

Manipulating Rows in Data Frames

import numpy as np

# 6x6 grid of the floats 0.0-35.0 (rows a-f, columns col0-col5), followed by
# one extra string column so the frame mixes numeric and text data.
df = pd.DataFrame(
    np.arange(36, dtype=float).reshape(6, 6),
    index=list("abcdef"),
    columns=[f"col{n}" for n in range(6)],
)
df["col6"] = "apple orange pineapple mango kiwi lemon".split()
df

	col0	col1	col2	col3	col4	col5	col6
a	0.0	1.0	2.0	3.0	4.0	5.0	apple
b	6.0	7.0	8.0	9.0	10.0	11.0	orange
c	12.0	13.0	14.0	15.0	16.0	17.0	pineapple
d	18.0	19.0	20.0	21.0	22.0	23.0	mango
e	24.0	25.0	26.0	27.0	28.0	29.0	kiwi
f	30.0	31.0	32.0	33.0	34.0	35.0	lemon

df.loc[["a", "b"]]

	col0	col1	col2	col3	col4	col5	col6
a	0.0	1.0	2.0	3.0	4.0	5.0	apple
b	6.0	7.0	8.0	9.0	10.0	11.0	orange

Categorical Data

import numpy as np
import pandas as pd

# Start from a plain string column, then convert it to the category dtype.
df = pd.DataFrame({"A": list("abca")})

df["A"] = pd.Categorical(df["A"])
df["A"]

0    a
1    b
2    c
3    a
Name: A, dtype: category
Categories (3, object): ['a', 'b', 'c']

# Draw 20 integers in [0, 100) and bucket them into ten-wide, left-closed bins
# labelled "0 - 9", "10 - 19", ..., "90 - 99".
df = pd.DataFrame({"value": np.random.randint(0, 100, size=20)})
labels = [f"{lower} - {lower + 9}" for lower in range(0, 100, 10)]
df["group"] = pd.cut(
    df["value"],
    bins=range(0, 101, 10),
    right=False,
    labels=labels,
)
df.head()

	value	group
0	52	50 - 59
1	16	10 - 19
2	49	40 - 49
3	86	80 - 89
4	38	30 - 39

The `group` column is of categorical type. Another way to create a categorical variable is to use the `pd.Categorical` function directly:

# Only "b", "c" and "d" are declared as categories, so every "a" in the
# values is stored as NaN.
raw_cat = pd.Categorical(
    values=list("abcadac"),
    categories=list("bcd"),
)
raw_cat

[NaN, 'b', 'c', NaN, 'd', NaN, 'c']
Categories (3, object): ['b', 'c', 'd']

df = pd.DataFrame(raw_cat, columns=["cat_type"])
df["cat_type"]

0    NaN
1      b
2      c
3    NaN
4      d
5    NaN
6      c
Name: cat_type, dtype: category
Categories (3, object): ['b', 'c', 'd']

You can also use special functions, such as `pd.cut()`, to group data into discrete bins.

# ordered=True gives the categories a ranking that follows the order in which
# they are listed: a < b < c < d.
ordered_cat = pd.Categorical(
    list("abcadac"),
    categories=list("abcd"),
    ordered=True,
)
ordered_cat

['a', 'b', 'c', 'a', 'd', 'a', 'c']
Categories (4, object): ['a' < 'b' < 'c' < 'd']

pd.qcut(range(1,10),4)

[(0.999, 3.0], (0.999, 3.0], (0.999, 3.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (5.0, 7.0], (7.0, 9.0], (7.0, 9.0]]
Categories (4, interval[float64, right]): [(0.999, 3.0] < (3.0, 5.0] < (5.0, 7.0] < (7.0, 9.0]]

Working with Categories

df['cat_type']

0    NaN
1      b
2      c
3    NaN
4      d
5    NaN
6      c
Name: cat_type, dtype: category
Categories (3, object): ['b', 'c', 'd']

df['cat_type'].cat

<pandas.core.arrays.categorical.CategoricalAccessor object at 0x13d87fd60>

df['cat_type'].cat.categories

Index(['b', 'c', 'd'], dtype='object')

df['cat_type'].cat.ordered

False

Operation on Categories

df['cat_type'].value_counts()

c    2
b    1
d    1
Name: cat_type, dtype: int64

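Note that `value_counts()` on a categorical column reports every declared category, so a category that never appears in the data would still get a count (of zero); here 'b', 'c' and 'd' all appear at least once. The `NaN` entries are left out of the counts by default. This tracking of unused categories can be quite handy.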

df["cat_type"].mode()

0    c
Name: cat_type, dtype: category
Categories (3, object): ['b', 'c', 'd']