六、Pandas

# 六、Pandas

import pandas as pd
import numpy as np

1
2

# 6.1 Series

Series 是 Pandas 的核心数据结构之一，是一维数据结构。它由两部分组成：index（索引标签，类型为 `pd.Index`）和 values（数据，底层通常为 NumPy 的 ndarray）。

# 6.1.1 创建

# Build a Series from a mapping: the keys become the index labels and the
# values become the data. The mapping is named `scores` (not `dict`) so the
# builtin `dict` type is not shadowed for the rest of the session.
scores = {'a': 10, 'b': 2, 'c': 3}
data = pd.Series(scores)

# Build a Series from a list, supplying the index labels explicitly.
data2 = pd.Series([1, 2, 3], index=['A', 'B', 'C'])
data2

1
2
3
4
5

# 6.1.2 访问

可按位置下标访问（推荐使用 `.iloc`），也可通过索引标签（key）访问。

# The mapping is named `scores` (not `dict`) so the builtin is not shadowed.
scores = {'a': 10, 'b': 2, 'c': 3}
data = pd.Series(scores)

# data['a'] looks up by label; data.iloc[1] looks up by position.
# A bare data[1] on a string-labelled Series relies on a deprecated
# "integer key means position" fallback, so .iloc is used explicitly.
print(data['a'], data.iloc[1])

1
2
3
4

# 6.1.3 修改索引 index

# The mapping is named `scores` (not `dict`) so the builtin is not shadowed.
scores = {'a': 10, 'b': 2, 'c': 3}
data = pd.Series(scores)

# Assigning to .index relabels the Series in place; the values are untouched.
# The new label list must have the same length as the Series.
data.index = ['O', 'P', 'Q']

data

1
2
3
4
5

# 6.1.4 拼接 concat

# Two Series with disjoint labels: one holds integers, the other strings.
data1 = pd.Series([1, 2], index=['A', 'BB'])
data2 = pd.Series({'R': 'O', 'S': 'PP', 'T': 'QQ'})

# pd.concat stacks the pieces end to end; each piece keeps its own labels.
pieces = [data1, data2]
data = pd.concat(pieces)
data

1
2
3
4
5

# 6.2 DataFrame

类似 Excel 表格的二维数据结构

# 6.2.1 创建

import numpy as np
import pandas as pd
# Random 3x3 values; rows are labelled A-C and columns X-Z.
matrix = np.random.rand(3, 3)
row_labels = ['A', 'B', 'C']
col_labels = ['X', 'Y', 'Z']
df = pd.DataFrame(data=matrix, index=row_labels, columns=col_labels)

# `display` is the rich-output helper available inside IPython/Jupyter sessions.
display(df)

# A DataFrame can also be built from a dict mapping column name -> column values;
# with no index given, rows get the default labels 0, 1, ...
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
display(df1)

1
2
3
4
5
6
7
8
9

	X	Y	Z
A	0.883420	0.997401	0.645058
B	0.716311	0.877419	0.165533
C	0.842964	0.010090	0.063684

	A	B
0	1	3
1	2	4

# 6.2.2 列操作

# 基础

df[列名] = [值1, 值2, …]　（新增或覆盖一列）
df.index　（查看行索引）
按位置序号：df.iloc[行, 列]，例如 df.iloc[:, :]
按索引标签：df.loc[行, 列]，例如 df.loc[:, :]

# Passing a list of column labels selects those columns as a DataFrame.
print(df[['X','Y']])

# Positional slices are half-open, so 1:2 selects only the column at position 1.
print(df.iloc[:,1:2]) # half-open interval: start included, stop excluded

1
2
3

# Assigning to a column label that does not exist yet adds a new column;
# the list must supply one value per row.
df['SSS'] = [1,1,1]
df

1
2

# 删除 del

# `del` removes the column from df in place (nothing is returned).
del df['SSS']

df

1
2
3

# 6.2.3 行操作

# 基础

按位置序号取行：df.iloc[行位置]
按索引标签取行：df.loc[行标签]

# iloc[0] picks the first row by position; a single row comes back as a Series
# whose index is the DataFrame's column labels.
x = df.iloc[0]
x

1
2

# 创建

# The new row starts life as a Series: its index must match df's columns,
# and its name ('t') becomes the row label once it is turned into a frame.
se = pd.Series([1, 2, 3], index=['X', 'Y', 'Z'], name='t')

# to_frame() gives a one-column frame; transposing makes it a one-row frame.
row = se.to_frame().T
newDf = pd.concat([df, row])

print(newDf)

1
2
3
4

# 删除

# drop(..., axis=0) returns a NEW DataFrame without row 'A'; newDf itself is
# unchanged because the result is not assigned and inplace=True is not passed.
newDf.drop(['A'],axis=0)

# 6.2.4 数据查询

# 3x3 integer grid holding 1..9. Note the row labels are the STRINGS '1'-'3'.
arr = np.arange(1, 10).reshape(3, 3)
df = pd.DataFrame(arr, index=['1', '2', '3'], columns=['A', 'B', 'C'])

display(df)

1
2
3
4

# 按区间范围

# Label slices with .loc include BOTH endpoints: rows '1' and '2', column 'A'.
df.loc['1':'2','A']

# 按条件表达

# Boolean mask: keep every row whose 'C' value is >= 6, with all columns.
df.loc[df['C']>=6,:]

# 按值查询

# One row label plus one column label returns a single scalar value.
df.loc['1','C']

# 按列表

# Lists of labels select rows and columns in the order given,
# so the result's columns come back as C, then B.
df.loc[['1','2'],['C','B']]

# 按自定义函数查找

# A callable passed to .loc is called with the DataFrame and must return an
# indexer; here it returns a boolean mask selecting rows where 'C' > 5.
df.loc[lambda df: df['C'] > 5, :]

# 6.2.5 数据统计

# Random 3x3 frame with integer row labels 1-3 and column labels A-C.
values = np.random.rand(3, 3)
df = pd.DataFrame(values, index=[1, 2, 3], columns=['A', 'B', 'C'])
df

1
2

# 排序

注意：`sort_index` 按索引标签排序，本例中 index 为 int 类型；索引标签只需可以相互比较即可（例如全为数字或全为字符串）。

# sort_index returns a new DataFrame ordered by index label; df is not modified.
df2 = df.sort_index(ascending=False) # ascending=True sorts in ascending order
df2

1
2

# 统计指标

# Load the sample CSV; the path is relative to the current working directory.
file = pd.read_csv('./pandas/test1.CSV')
display(file)

# describe() reports summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns.
file.describe()

# print(file['size'].mean())

1
2
3
4
5
6

# 分类汇总

GroupBy 可以计算目标类别的统计特征，例如按“place_of_production”、“level”将物品分类，并计算所有数字列的统计特征

# Load the second sample CSV; the path is relative to the working directory.
file2 = pd.read_csv('./pandas/test2.CSV')

display(file2)
# Group rows by the (place_of_production, level) pair and summarise each group.
file2.groupby(['place_of_production','level']).describe()

1
2
3
4

按 place_of_production、level 分组，计算 number 列的均值与总和

# Mean and sum of 'number' for every (place_of_production, level) group.
# Aggregations are named with strings: passing the NumPy callables np.mean /
# np.sum to agg() emits a FutureWarning in pandas >= 2.1. The resulting
# column names ('mean', 'sum') are the same either way.
file2.groupby(['place_of_production','level'])['number'].agg(['mean', 'sum'])

# Keep the GroupBy object so it can be both summarised and iterated.
df3 = file2.groupby('place_of_production')

display(df3.describe())

# Iterating a GroupBy yields (group key, sub-DataFrame) pairs.
for name, group in df3:
    print(name)
    print(group)
    print('=============')

1
2
3
4
5
6
7
8

# 6.2.6 常用方法

# df.dropna(inplace=True)

删除含有缺失值（NaN）的行或列

参数 inplace：为 False（默认）时返回处理后的副本，原对象不变；为 True 时直接在原对象上修改，并返回 None。

# 行数、列数

# shape is a (rows, columns) tuple.
df.shape[0]  # number of rows
df.shape[1]  # number of columns

五、Numpy 七、SciPy

Hola，最近还好吗？

Choose mode

六、Pandas

# 六、Pandas

# 6.1 Series

# 6.1.1 创建

# 6.1.2 访问

# 6.1.3 修改索引 index

# 6.1.4 拼接 concat

# 6.2 DataFrame

# 6.2.1 创建

# 6.2.2 列操作

# 基础

# 删除 del

# 6.2.3 行操作

# 基础

# 创建

# 删除

# 6.2.4 数据查询

# 按区间范围

# 按条件表达

# 按值查询

# 按列表

# 按自定义函数查找

# 6.2.5 数据统计

# 排序

# 统计指标

# 分类汇总

# 6.2.6 常用方法

# df.dropna(inplace=True)

# 行数、列数