一、NumPy
1、NumPy简介
NumPy is the fundamental package for scientific computing in Python. It is a Python library that provides a multidimensional array object, various derived objects (such as masked arrays and matrices), and an assortment of routines for fast operations on arrays, including mathematical, logical, shape manipulation, sorting, selecting, I/O, discrete Fourier transforms, basic linear algebra, basic statistical operations, random simulation and much more.
2、NumPy的基础操作
创建ndarray
# numpy的简单操作
import numpy as np
data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
arr = np.array(data)
print(data)
print(arr)
print(type(arr))
print(arr.ndim) # 维度
print(arr.shape) # 形状
print(arr.dtype) # 数据类型
''' 输出:
[[1, 2, 3], [4, 5, 6], [7, 8, 9]]
[[1 2 3]
[4 5 6]
[7 8 9]]
<class 'numpy.ndarray'>
2
(3, 3)
int32
'''
# ndarray的创建
import numpy as np
data = [[1, 2, 3], [4, 5, 6]]
arr1 = np.array(data) # 会创建新副本
arr2 = np.asarray(data) # 不会创建新副本
arr3 = np.ones([2, 3], dtype=np.int64) # 创建全为1的矩阵
arr4 = np.zeros([2, 3]) # 创建全为0的矩阵
arr5 = np.empty([2, 3]) # 创建矩阵,并填入无意义值
arr6 = np.full([2, 3], fill_value=1.23) # 指定填充数值创建矩阵
arr7 = np.eye(2) # 创建n*n的对角阵
print('arr1:\n', arr1)
print('arr2:\n', arr2)
print('arr3:\n', arr3)
print('arr4:\n', arr4)
print('arr5:\n', arr5)
print('arr6:\n', arr6)
print('arr7:\n', arr7)
''' 输出
arr1:
[[1 2 3]
[4 5 6]]
arr2:
[[1 2 3]
[4 5 6]]
arr3:
[[1 1 1]
[1 1 1]]
arr4:
[[0. 0. 0.]
[0. 0. 0.]]
arr5:
[[1.13863948e-311 1.13863947e-311 1.13863947e-311]
[1.13942031e-311 5.92878775e-323 4.59481051e-322]]
arr6:
[[1.23 1.23 1.23]
[1.23 1.23 1.23]]
arr7:
[[1. 0.]
[0. 1.]]
'''
准备工作(获得一个矩阵)
import os
import numpy as np
import requests
from PIL import Image
isExist = os.path.exists('./cat.jpg')
if not isExist:
url = 'https://bkimg.cdn.bcebos.com/pic/a8ec8a13632762d0f703fa1f08ba1ffa513d2697684e'
open('./cat.jpg', 'wb').write(requests.get(url).content)
cat = Image.open('./cat.jpg') # 读取图片
print(type(cat))
cat_data = np.array(cat) # 将图片数据转换为矩阵
print(cat_data) # jpeg范围为0~255的整数,png范围为0~1的浮点数
print(cat_data.min())
print(cat_data.max())
print(cat_data.shape)
''' 输出:
<class 'PIL.JpegImagePlugin.JpegImageFile'>
[[[231 212 195]
[231 213 193]
[232 214 194]
...
[189 137 97]
[189 137 97]
[189 137 97]]
[[231 213 193]
[231 213 193]
[232 214 192]
...
[187 138 97]
[189 137 97]
[189 137 97]]
[[230 212 192]
[231 213 191]
[230 214 191]
...
[187 138 97]
[187 138 97]
[187 138 97]]
...
[[228 208 183]
[228 208 183]
[228 208 183]
...
[203 178 156]
[203 178 156]
[203 178 156]]
[[228 208 183]
[228 208 183]
[228 208 183]
...
[202 177 155]
[202 177 155]
[202 177 155]]
[[228 208 183]
[228 208 183]
[228 208 183]
...
[202 177 155]
[202 177 155]
[202 177 155]]]
0
255
(1357, 1080, 3)
进程已结束,退出代码0
'''
猫猫图片
图片是由像素组成的一个矩阵,因此图片至少是二位矩阵。但图片需要包含色彩信息,因此需要一个三位矩阵分别携带RGB的色彩信息。
# 仅保留G-Green像素信息的图片
import numpy as np
from PIL import Image
cat = Image.open('./cat.jpg')
cat_data = np.array(cat)
print(cat_data.shape)
for i in range(len(cat_data)):
for j in range(len(cat_data[0])):
cat_data[i][j][0] = 0 # 清除R-Red的像素信息
cat_data[i][j][2] = 0 # 清除B-Blue的像素信息
Image.fromarray(cat_data).show()
''' 输出:
(1357, 1080, 3)
'''
仅保留G-Green像素信息的图片
切片
# 像素裁切
import numpy as np
from PIL import Image
cat = Image.open('./cat.jpg')
cat_data = np.array(cat)
cat2 = cat_data[400:1100, 300:800] # 像素裁切
Image.fromarray(cat2).show()
切片后的图片
变形
# 矩阵变形
import numpy as np
arr = np.array([[1, 2, 3], [4, 5, 6]])
print('arr:\n', arr)
print('arr_reshape(3,2):\n', arr.reshape(3, 2))
print('arr_reshape(6):\n', arr.reshape(6))
''' 输出:
arr:
[[1 2 3]
[4 5 6]]
arr_reshape(3,2):
[[1 2]
[3 4]
[5 6]]
arr_reshape(6):
[1 2 3 4 5 6]
'''
# 实现图片旋转
import numpy as np
from PIL import Image
cat = Image.open('./cat.jpg')
cat_data = np.array(cat)
cat2 = cat_data.transpose(1, 0, 2) # (1357, 1080, 3)将行列的索引进行交换
Image.fromarray(cat2).show()
旋转的图片
级联
# 实现图片连接
import numpy as np
from PIL import Image
cat = Image.open('./cat.jpg')
cat_data = np.array(cat)
cat2 = np.concatenate((cat_data, cat_data), axis=1) # axis默认为0,表示x轴扩展(行扩展);1表示y轴扩展(列扩展)
print(cat_data.shape)
print(cat2.shape)
Image.fromarray(cat2).show()
''' 输出:
(1357, 1080, 3)
(1357, 2160, 3)
'''
图片级联
# 也可以使用hstack()或vstack()方法进行级联
import numpy as np
arr = np.array([[1, 2, 3], [4, 5, 6]])
print(np.vstack((arr, arr))) # 列扩展
print(np.hstack((arr, arr))) # 行扩展
''' 输出:
[[1 2 3]
[4 5 6]
[1 2 3]
[4 5 6]]
[[1 2 3 1 2 3]
[4 5 6 4 5 6]]
'''
切分
# 矩阵切分
import numpy as np
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr_sp = np.split(arr, indices_or_sections=[1])
print(arr_sp)
arr_sp = np.split(arr, indices_or_sections=[1, 2])
print(arr_sp)
arr_sp = np.split(arr, indices_or_sections=[1], axis=0) # 按x轴的水平线切分,与np.vsplit(arr, indices_or_sections=[1])等效
print(arr_sp)
arr_sp = np.split(arr, indices_or_sections=[1], axis=1) # 按y轴的水平线切分,与np.hsplit(arr, indices_or_sections=[1])等效
print(arr_sp)
''' 输出:
[array([[1, 2, 3]]), array([[4, 5, 6],
[7, 8, 9]])]
[array([[1, 2, 3]]), array([[4, 5, 6]]), array([[7, 8, 9]])]
[array([[1, 2, 3]]), array([[4, 5, 6],
[7, 8, 9]])]
[array([[1],
[4],
[7]]), array([[2, 3],
[5, 6],
[8, 9]])]
'''
# 将图片按照y轴水平线切分为左右两块,以实现切片效果
import numpy as np
from PIL import Image
cat = Image.open('./cat.jpg')
cat_data = np.array(cat)
cat_left, cat_right = np.hsplit(cat_data, indices_or_sections=[int(len(cat_data[0]) / 2)])
Image.fromarray(cat_left).show()
Image.fromarray(cat_right).show()
左半边
右半边
副本
# 创建数据副本
import numpy as np
data = np.array([[1, 2, 3], [4, 5, 6]])
arr = np.array(data)
print('arr: \n', arr)
arr[0][0] = 10
print('arr: \n', arr)
arr_cpy = arr.copy()
arr_cpy[0][0] = 20 # 修改副本不会对原矩阵产生影响
print('arr: \n', arr)
print('arr_cpy: \n', arr_cpy)
''' 输出:
arr:
[[1 2 3]
[4 5 6]]
arr:
[[10 2 3]
[ 4 5 6]]
arr:
[[10 2 3]
[ 4 5 6]]
arr_cpy:
[[20 2 3]
[ 4 5 6]]
'''
二、Pandas
1、Pandas简介
Pandas is a Python package that provides fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. It aims to be the fundamental high-level building block for doing practical, real world data analysis in Python. Additionally, it has the broader goal of becoming the most powerful and flexible open source data analysis / manipulation tool available in any language. It is already well on its way towards this goal.
2、Pandas的基础操作
数据读取
# 读取CSV文件
import pandas as pd
data = pd.read_csv('./greenhouse_gases.csv')
print(data.head(10)) # 读取前10行
''' 输出:
Unnamed: 0 year gas concentration
0 1 20 CO2 277.7
1 2 40 CO2 277.8
2 3 60 CO2 277.3
3 4 80 CO2 277.3
4 5 100 CO2 277.5
5 6 120 CO2 277.6
6 7 140 CO2 278.3
7 8 160 CO2 279.7
8 9 180 CO2 280.4
9 10 200 CO2 280.6
'''
# 读取CSV文件并设置读取格式
import pandas as pd
# sep:分隔符,header:表头行数,names:表头名称
data = pd.read_csv('./greenhouse_gases.csv', sep=',', header=0, names=['index', 'year', 'gas', 'concentration'])
print(data.head(10))
''' 输出:
index year gas concentration
0 1 20 CO2 277.7
1 2 40 CO2 277.8
2 3 60 CO2 277.3
3 4 80 CO2 277.3
4 5 100 CO2 277.5
5 6 120 CO2 277.6
6 7 140 CO2 278.3
7 8 160 CO2 279.7
8 9 180 CO2 280.4
9 10 200 CO2 280.6
'''
Series
# Series
import pandas as pd
s1 = pd.Series([1.2, 'x', 5, 'man'])
print(s1)
''' 输出:
0 1.2
1 x
2 5
3 man
dtype: object
'''
# Series可接收不同参数
import pandas as pd
print('-' * 6, 's1', '-' * 6)
s1 = pd.Series([1.2, 'x', 5, 'man'])
print(s1)
print('s1.index = ', s1.index)
print('s1.values = ', s1.values)
print('-' * 6, 's2', '-' * 6)
s2 = pd.Series([1.2, 'x', 5, 'man'], index=['a', 'b', 'c', 'd']) # index用来指定索引
print(s2)
print('-' * 6, 's3', '-' * 6)
s3 = pd.Series({'a': 1, 'b': 2, 'c': 3, 'd': 4})
print(s3)
''' 输出:
------ s1 ------
0 1.2
1 x
2 5
3 man
dtype: object
s1.index = RangeIndex(start=0, stop=4, step=1)
s1.values = [1.2 'x' 5 'man']
------ s2 ------
a 1.2
b x
c 5
d man
dtype: object
------ s3 ------
a 1
b 2
c 3
d 4
dtype: int64
'''
# Series数据查询
import pandas as pd
s = pd.Series([1.2, 'x', 5, 'man'], index=['a', 'b', 'c', 'd'])
print(s['a'])
print(s[['b', 'a']]) # 查询多个索引
''' 输出:
1.2
b x
a 1.2
dtype: object
'''
# Series数据切片
import pandas as pd
s = pd.Series([1.2, 'x', 5, 'man'], index=['a', 'b', 'c', 'd'])
print(s['a':'c'])
print(s.loc['a':'c'])
print(s.iloc[0:3])
''' 输出:
a 1.2
b x
c 5
dtype: object
a 1.2
b x
c 5
dtype: object
a 1.2
b x
c 5
dtype: object
'''
DataFrame
import numpy as np
import pandas as pd
data = pd.read_csv('./greenhouse_gases.csv')
top10 = data.head(10)
print(top10, end='\n\n')
df = pd.DataFrame(top10.values, index=list('abcdefghij'), columns=['index', 'year_new', 'gas_new', 'concentration_new'])
print(df, end='\n\n')
print(df.dtypes, end='\n\n')
df['index'] = df['index'].astype(np.int64)
print(df.dtypes)
''' 输出:
Unnamed: 0 year gas concentration
0 1 20 CO2 277.7
1 2 40 CO2 277.8
2 3 60 CO2 277.3
3 4 80 CO2 277.3
4 5 100 CO2 277.5
5 6 120 CO2 277.6
6 7 140 CO2 278.3
7 8 160 CO2 279.7
8 9 180 CO2 280.4
9 10 200 CO2 280.6
index year_new gas_new concentration_new
a 1 20 CO2 277.7
b 2 40 CO2 277.8
c 3 60 CO2 277.3
d 4 80 CO2 277.3
e 5 100 CO2 277.5
f 6 120 CO2 277.6
g 7 140 CO2 278.3
h 8 160 CO2 279.7
i 9 180 CO2 280.4
j 10 200 CO2 280.6
index object
year_new object
gas_new object
concentration_new object
dtype: object
index int64
year_new object
gas_new object
concentration_new object
dtype: object
'''
数据查询
# 修改索引及修改列数值类型
import numpy as np
import pandas as pd
df = pd.read_csv('./greenhouse_gases.csv')
print(df.index)
df.set_index('year', inplace=True)
print(df.index, end='\n\n')
print(df.head())
df.loc[:, 'concentration'] = df['concentration'].astype(str).str.replace('.', '').astype(np.int32)
print(df.head())
''' 输出:
RangeIndex(start=0, stop=300, step=1)
Index([ 20, 40, 60, 80, 100, 120, 140, 160, 180, 200,
...
1820, 1840, 1860, 1880, 1900, 1920, 1940, 1960, 1980, 2000],
dtype='int64', name='year', length=300)
Unnamed: 0 gas concentration
year
20 1 CO2 277.7
40 2 CO2 277.8
60 3 CO2 277.3
80 4 CO2 277.3
100 5 CO2 277.5
Unnamed: 0 gas concentration
year
20 1 CO2 2777.0
40 2 CO2 2778.0
60 3 CO2 2773.0
80 4 CO2 2773.0
100 5 CO2 2775.0
'''
# DataFrame的多条件过滤查询
import pandas as pd
df = pd.read_csv('./greenhouse_gases.csv')
print(df.loc[0:5, ['gas', 'concentration']])
df.set_index('year', inplace=True) # 修改索引列
print(df.loc[[20, 40], ['gas', 'concentration']]) # 连续数据筛选
# df[(df.index.values >= 100) & (df.index.values <= 300) & (df['gas'] == 'CO2')] 与下面等效
print(df.loc[(df.index.values >= 100) & (df.index.values <= 300) & (df['gas'] == 'CO2'), :]) # 多条件筛选
''' 输出:
gas concentration
0 CO2 277.7
1 CO2 277.8
2 CO2 277.3
3 CO2 277.3
4 CO2 277.5
5 CO2 277.6
gas concentration
year
20 CO2 277.7
20 CH4 638.1
20 N2O 263.2
40 CO2 277.8
40 CH4 631.1
40 N2O 263.3
Unnamed: 0 gas concentration
year
100 5 CO2 277.5
120 6 CO2 277.6
140 7 CO2 278.3
160 8 CO2 279.7
180 9 CO2 280.4
200 10 CO2 280.6
220 11 CO2 281.3
240 12 CO2 281.4
260 13 CO2 280.6
280 14 CO2 280.0
300 15 CO2 279.8
'''
新增数据列
# 直接赋值(新增一列)
import pandas as pd
df = pd.read_csv('./greenhouse_gases.csv')
df.loc[:, 'concentration2'] = df['concentration'].astype(str) + 'ppm'
print(df)
''' 输出:
Unnamed: 0 year gas concentration concentration2
0 1 20 CO2 277.7 277.7ppm
1 2 40 CO2 277.8 277.8ppm
2 3 60 CO2 277.3 277.3ppm
3 4 80 CO2 277.3 277.3ppm
4 5 100 CO2 277.5 277.5ppm
.. ... ... ... ... ...
295 296 1920 N2O 285.2 285.2ppm
296 297 1940 N2O 287.7 287.7ppm
297 298 1960 N2O 292.3 292.3ppm
298 299 1980 N2O 302.6 302.6ppm
299 300 2000 N2O 315.4 315.4ppm
'''
# 一次新增多列
import pandas as pd
df = pd.read_csv('./greenhouse_gases.csv')
df = df.assign(gasA = lambda x : x['gas'] + 'a', gasB = lambda x : x['gas'] + 'b')
print(df.head())
''' 输出:
Unnamed: 0 year gas concentration gasA gasB
0 1 20 CO2 277.7 CO2a CO2b
1 2 40 CO2 277.8 CO2a CO2b
2 3 60 CO2 277.3 CO2a CO2b
3 4 80 CO2 277.3 CO2a CO2b
4 5 100 CO2 277.5 CO2a CO2b
'''
# 根据自定义函数赋值
import pandas as pd
def get_gas_type(df):
if df['gas'] == 'CH4':
return '甲烷'
elif df['gas'] == 'N2O':
return '一氧化二氮'
elif df['gas'] == 'CO2':
return '二氧化碳'
else:
return '其他温室气体'
df = pd.read_csv('./greenhouse_gases.csv')
df.loc[:, 'gas'] = df.apply(get_gas_type, axis=1)
print(df)
'''
Unnamed: 0 year gas concentration
0 1 20 二氧化碳 277.7
1 2 40 二氧化碳 277.8
2 3 60 二氧化碳 277.3
3 4 80 二氧化碳 277.3
4 5 100 二氧化碳 277.5
.. ... ... ... ...
295 296 1920 一氧化二氮 285.2
296 297 1940 一氧化二氮 287.7
297 298 1960 一氧化二氮 292.3
298 299 1980 一氧化二氮 302.6
299 300 2000 一氧化二氮 315.4
[300 rows x 4 columns]
'''
# 数值统计和计算
import pandas as pd
df = pd.read_csv('./greenhouse_gases.csv')
print(df['gas'].value_counts()) # 统计不同gas列参数的数量
print(df['gas'].unique()) # 获取不同的gas列
''' 输出:
gas
CO2 100
CH4 100
N2O 100
Name: count, dtype: int64
['CO2' 'CH4' 'N2O']
'''
处理丢失数据
# 缺失值判断
import numpy as np
import pandas as pd
df = pd.DataFrame({'Python': [np.nan, 128, 117, None], 'Math': [119, 88, 116, np.NaN]},
index=list('abcd'), columns=['Python', 'Math', 'En'])
print(df, end='\n\n')
print(df.isnull(), end='\n\n') # 判断格子内是否为空数据
print(df.notnull(), end='\n\n') # 判断格子内是否为不为空数据
print(df.any(), end='\n\n') # 某列只要有一个True就是True
print(df.all(), end='\n\n') # 某列都为true才是True
print(df.isnull().any(axis=0), end='\n\n') # 列是否都为null
print(df.isnull().all(axis=1), end='\n\n') # 行是否都为null
''' 输出:
Python Math En
a NaN 119.0 NaN
b 128.0 88.0 NaN
c 117.0 116.0 NaN
d NaN NaN NaN
Python Math En
a True False True
b False False True
c False False True
d True True True
Python Math En
a False True False
b True True False
c True True False
d False False False
Python True
Math True
En False
dtype: bool
Python True
Math True
En True
dtype: bool
Python True
Math True
En True
dtype: bool
a False
b False
c False
d True
dtype: bool
'''
# 删除缺失值,也可参考DataFrame的多条件过滤查询
import numpy as np
import pandas as pd
df = pd.DataFrame({'Python': [np.nan, 128, 117, None], 'Math': [119, 88, 116, np.NaN]},
index=list('abcd'), columns=['Python', 'Math', 'En'])
delList = [x for x in (set(df.index.values) - {'a', 'c'})] # 保留ac
print(df.drop(delList), end='\n\n') # 删除保留外的行
print(df.dropna(), end='\n\n') # 丢弃空值行/列
''' 输出:
Python Math En
a NaN 119.0 NaN
c 117.0 116.0 NaN
Empty DataFrame
Columns: [Python, Math, En]
Index: []
'''
# 填充缺失值
import numpy as np
import pandas as pd
df = pd.DataFrame({'Python': [np.nan, 128, 117, None], 'Math': [119, 88, 116, np.NaN]},
index=list('abcd'), columns=['Python', 'Math', 'En'])
print(df.fillna(value=1, limit=2), end='\n\n') # value:填充值,limit参数:限制填充个数
print(df.fillna(method='ffill')) # pad/ffill:用前一个非缺失值去填充该缺失值 backfill/bfill:用下一个非缺失值填充该缺失值 None:指定一个值去替换缺失值(缺省默认这种方式)
'''
Python Math En
a 1.0 119.0 1.0
b 128.0 88.0 1.0
c 117.0 116.0 NaN
d 1.0 1.0 NaN
Python Math En
a NaN 119.0 NaN
b 128.0 88.0 NaN
c 117.0 116.0 NaN
d 117.0 116.0 NaN
'''
分组和合并
# 分组和合并
import numpy as np
import pandas as pd
df = pd.read_csv('./greenhouse_gases.csv')
group_by_gas = df.groupby('gas', sort=True)
print(group_by_gas.sum(), end='\n\n') # 求和
print(group_by_gas.aggregate('sum'), end='\n\n') # 求和,aggregate()可简写为agg()
print(group_by_gas.aggregate(np.mean), end='\n\n') # 求平均
print(group_by_gas.aggregate(np.std), end='\n\n') # 求标准差
''' 输出:
Unnamed: 0 year concentration
gas
CH4 15050 101000 69788.3
CO2 5050 101000 28212.4
N2O 25050 101000 26847.2
Unnamed: 0 year concentration
gas
CH4 15050 101000 69788.3
CO2 5050 101000 28212.4
N2O 25050 101000 26847.2
Unnamed: 0 year concentration
gas
CH4 150.5 1010.0 697.883
CO2 50.5 1010.0 282.124
N2O 250.5 1010.0 268.472
Unnamed: 0 year concentration
gas
CH4 29.011492 580.22984 154.027558
CO2 29.011492 580.22984 11.876368
N2O 29.011492 580.22984 7.761210
'''
表格匹配与拼接
# 表格拼接concat()
import pandas as pd
df1 = pd.DataFrame({'apts': [55000, 60000], 'cars': [200000, 300000]}, index=['Shanghai', 'Beijing'])
df2 = pd.DataFrame({'apts': [150000, 120000], 'cars': [25000, 20000]}, index=['Hangzhou', 'Nanjing'])
df3 = pd.DataFrame({'apts': [30000, 10000], 'cars': [100000, 100000]}, index=['Guangzhou', 'Chongqing'])
frames = pd.concat([df1, df2, df3], keys=['df1', 'df2', 'df3'], sort=False) # key:添加区分关键字
print(frames)
''' 输出:
apts cars
df1 Shanghai 55000 200000
Beijing 60000 300000
df2 Hangzhou 150000 25000
Nanjing 120000 20000
df3 Guangzhou 30000 100000
Chongqing 10000 100000
'''
# 表格拼接_append()
import pandas as pd
df1 = pd.DataFrame({'apts': [55000, 60000], 'cars': [200000, 300000]}, index=['Shanghai', 'Beijing'])
df2 = pd.DataFrame({'apts': [150000, 120000], 'cars': [25000, 20000]}, index=['Hangzhou', 'Nanjing'])
print(df1._append(df2, sort=False))
''' 输出:
apts cars
Shanghai 55000 200000
Beijing 60000 300000
Hangzhou 150000 25000
Nanjing 120000 20000
'''
# 表格拼接merge()
import pandas as pd
df1 = pd.DataFrame(
{'apts': [55000, 60000, 58000], 'cars': [200000, 300000, 250000], 'city': ['Shanghai', 'Beijing', 'Shenzhen']})
df2 = pd.DataFrame(
{'salaries': [10000, 30000, 3000, 2000, 15000], 'city': ['Suzhou', 'Beijing', 'Shanghai', 'Guangzhou', 'Tianjin']})
print(df1, end='\n\n')
print(df2, end='\n\n')
print(pd.merge(df1, df2, on='city', how='inner'), end='\n\n') # left:左连接, right:右连接, inner:内连接, outer:全连接, cross:交叉连接
print(pd.concat([df1.set_index('city'), df2.set_index('city')], sort=False, axis=1, join='inner')) # concat()可以达到同样效果
''' 输出:
apts cars city
0 55000 200000 Shanghai
1 60000 300000 Beijing
2 58000 250000 Shenzhen
salaries city
0 10000 Suzhou
1 30000 Beijing
2 3000 Shanghai
3 2000 Guangzhou
4 15000 Tianjin
apts cars city salaries
0 55000 200000 Shanghai 3000
1 60000 300000 Beijing 30000
apts cars salaries
city
Shanghai 55000 200000 3000
Beijing 60000 300000 30000
'''
# 表格拼接join()
import pandas as pd
df1 = pd.DataFrame(
{'apts': [55000, 60000, 58000], 'cars': [200000, 300000, 250000], 'city': ['Shanghai', 'Beijing', 'Shenzhen']})
df2 = pd.DataFrame(
{'salaries': [10000, 30000, 3000, 2000, 15000], 'city': ['Suzhou', 'Beijing', 'Shanghai', 'Guangzhou', 'Tianjin']})
print(df1.join(df2, how='left', lsuffix='left', rsuffix='right'))
''' 输出:
apts cars cityleft salaries cityright
0 55000 200000 Shanghai 10000 Suzhou
1 60000 300000 Beijing 30000 Beijing
2 58000 250000 Shenzhen 3000 Shanghai
'''