博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Pandas 基本技巧
阅读量:4544 次
发布时间:2019-06-08

本文共 11360 字,大约阅读时间需要 37 分钟。

1.数据查看和转置

import numpy as np
import pandas as pd  # 导入numpy、pandas模块

# 数据查看、转置
df = pd.DataFrame(np.random.rand(16).reshape(8,2)*100,
                   columns = ['a','b'])
print(df.head(2))  #查看前两条数据
print(df.tail())
# .head()查看头部数据
# .tail()查看尾部数据
# 默认查看5条
print(df.T)
# .T 转置

输出结果:

           a          b
0  64.231620  24.222954
1   3.004779  92.549576
           a          b
3  54.787062  17.264577
4  13.106864   5.500618
5   8.631310  79.109355
6  22.107241  94.901685
7  29.034599  54.156278
           0          1          2          3          4          5  \
a  64.231620   3.004779  25.002825  54.787062  13.106864   8.631310
b  24.222954  92.549576  87.818090  17.264577   5.500618  79.109355

           6          7
a  22.107241  29.034599
b  94.901685  54.156278

2.(1)添加与修改_1

# 添加与修改
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df)
df['e'] = 10
df.loc[4] = 20
print(df)
# 新增列/行并赋值
df['e'] = 20
df[['a','c']] = 100
print(df)
# 索引后直接修改值
#注意:不能同时添加两列,否则会报错,如:df[['f','g']] = 200 ,必须一列一列的添加

输出结果:

           a          b          c          d
0  14.342082  52.604100  26.561995  60.441731
1  20.331108  43.537490   1.020098   7.171418
2  35.226542   9.573718  99.273254   0.867227
3  47.511549  56.783730  47.580639  67.007725
           a          b          c          d   e
0  14.342082  52.604100  26.561995  60.441731  10
1  20.331108  43.537490   1.020098   7.171418  10
2  35.226542   9.573718  99.273254   0.867227  10
3  47.511549  56.783730  47.580639  67.007725  10
4  20.000000  20.000000  20.000000  20.000000  20
     a          b    c          d   e
0  100  52.604100  100  60.441731  20
1  100  43.537490  100   7.171418  20
2  100   9.573718  100   0.867227  20
3  100  56.783730  100  67.007725  20
4  100  20.000000  100  20.000000  20

(2)添加与修改_2

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
df.iloc[0] = 100
print(df)
df.iloc[0] = [1,2,3,4]
print(df)
#增加一行尽量用loc去增加,iloc是不能增加的,会报错
df.loc[5] = 100
print(df)

输出结果:

            a           b           c           d
0  100.000000  100.000000  100.000000  100.000000
1   93.941010    7.951216   77.744847   66.842114
2   72.795874   40.031626   22.842638   92.876458
3   40.474858   53.663771   48.452597   66.444382
           a          b          c          d
0   1.000000   2.000000   3.000000   4.000000
1  93.941010   7.951216  77.744847  66.842114
2  72.795874  40.031626  22.842638  92.876458
3  40.474858  53.663771  48.452597  66.444382
            a           b           c           d
0    1.000000    2.000000    3.000000    4.000000
1   93.941010    7.951216   77.744847   66.842114
2   72.795874   40.031626   22.842638   92.876458
3   40.474858   53.663771   48.452597   66.444382
5  100.000000  100.000000  100.000000  100.000000

3.删除

(1)

# 删除  del / drop()
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df)
del df['a']
print(df)
print('-----')
# del语句 - 删除列
#注意:删除行的时候不能用del df.loc[index]或者df.iloc[index]  否则会报错 可以变相的删除 如删除第一行 可令df = df.iloc[1:]

print(df.drop(0))
print(df.drop([1,2]))
print(df)
print('-----')
# drop()删除行,inplace=False → 删除后生成新的数据,不改变原数据

print(df.drop(['d'], axis = 1)) #axis =0 的时候删除行
print(df)
# drop()删除列,需要加上axis = 1,inplace=False → 删除后生成新的数据,不改变原数据

输出结果:

           a          b          c          d
0  71.238538   6.121303  77.988034  44.047009
1  34.018365  78.192855  50.467246  81.162337
2  86.311980  44.341469  49.789445  35.657665
3  78.073272  31.457479  74.385014  24.655976
           b          c          d
0   6.121303  77.988034  44.047009
1  78.192855  50.467246  81.162337
2  44.341469  49.789445  35.657665
3  31.457479  74.385014  24.655976
-----
           b          c          d
1  78.192855  50.467246  81.162337
2  44.341469  49.789445  35.657665
3  31.457479  74.385014  24.655976
           b          c          d
0   6.121303  77.988034  44.047009
3  31.457479  74.385014  24.655976
           b          c          d
0   6.121303  77.988034  44.047009
1  78.192855  50.467246  81.162337
2  44.341469  49.789445  35.657665
3  31.457479  74.385014  24.655976
-----
           b          c
0   6.121303  77.988034
1  78.192855  50.467246
2  44.341469  49.789445
3  31.457479  74.385014
           b          c          d
0   6.121303  77.988034  44.047009
1  78.192855  50.467246  81.162337
2  44.341469  49.789445  35.657665
3  31.457479  74.385014  24.655976

(2)

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df.drop(0))
print(df)  #源数据不会改变
print(df.drop(0,inplace = True))  #这个方法改变了源数据,并不生成新的值了,所以输出为None
print(df)  #有inplace 参数的时候就替换了源数据

输出结果:

           a          b          c          d
1  78.187118  19.237655  94.443127  67.466532
2  37.921956  84.157197  23.311418  24.128222
3  12.330334   6.034799  62.023747  28.034041
           a          b          c          d
0  60.558857  94.367826  88.690379  33.957380
1  78.187118  19.237655  94.443127  67.466532
2  37.921956  84.157197  23.311418  24.128222
3  12.330334   6.034799  62.023747  28.034041
None
           a          b          c          d
1  78.187118  19.237655  94.443127  67.466532
2  37.921956  84.157197  23.311418  24.128222
3  12.330334   6.034799  62.023747  28.034041

4.对齐

# 对齐
df1 = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
print(df1)
print(df2)
print(df1 + df2) #有共同的列名和共同的标签的话 就会相加 。没有共同的部分就会变为空值。任何值和空值进行运算都会变为空值
# DataFrame对象之间的数据自动按照列和索引(行标签)对齐。

输出结果:

          A         B         C         D
0 -1.528903  0.519125 -0.214881 -0.591775
1 -0.334501 -0.837666  0.568927 -0.599237
2  0.753145  0.569262 -1.181976  1.225363
3 -0.177136 -0.367530  0.382826  1.447591
4  0.215967 -0.612947  0.844906  0.130414
5  0.414375 -0.207225  0.140776  1.086686
6  0.008855  2.873956 -0.650806 -2.631485
7 -0.634085  0.625107  0.046198 -0.352343
8  0.646812  0.928476  0.519168 -0.644997
9 -0.697006 -0.178875  0.856392 -0.512101
          A         B         C
0 -0.373297  0.607873  0.120016
1  0.343563 -2.901778 -0.370051
2  0.428568  0.319359 -3.263585
3  1.042845 -0.314763 -0.198816
4  0.071258 -0.484855  0.563127
5 -2.270312 -0.145558  0.931203
6  2.493652 -0.232491 -0.216451
          A         B         C   D
0 -1.902200  1.126998 -0.094865 NaN
1  0.009061 -3.739444  0.198876 NaN
2  1.181713  0.888620 -4.445561 NaN
3  0.865710 -0.682293  0.184010 NaN
4  0.287224 -1.097802  1.408034 NaN
5 -1.855938 -0.352783  1.071979 NaN
6  2.502507  2.641465 -0.867257 NaN
7       NaN       NaN       NaN NaN
8       NaN       NaN       NaN NaN
9       NaN       NaN       NaN NaN

5.排序

(1)按值排序

# 排序1 - 按值排序 .sort_values
# 同样适用于Series
df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df1)
print(df1.sort_values(['a'], ascending = True))  # 升序
#也可以这样写:print(df1.sort_values(by = 'a',ascending = True))
print(df1.sort_values(['a'], ascending = False))  # 降序
print('------')
# ascending参数:设置升序降序,默认升序
# 单列排序

df2 = pd.DataFrame({'a':[1,1,1,1,2,2,2,2],
                    'b':list(range(8)),
                    'c':list(range(8,0,-1))})
print(df2)
print(df2.sort_values(['a','c']))
# 多列排序,按列顺序排序
# 注意inplace参数

输出结果:

           a          b          c          d
0  28.598118   8.037050  51.856085  45.859414
1  91.412263  59.797819  27.912198   6.996883
2  92.001255  76.467245  76.524894  33.463836
3  47.054750  37.376781  94.286800  53.429360
           a          b          c          d
0  28.598118   8.037050  51.856085  45.859414
3  47.054750  37.376781  94.286800  53.429360
1  91.412263  59.797819  27.912198   6.996883
2  92.001255  76.467245  76.524894  33.463836
           a          b          c          d
2  92.001255  76.467245  76.524894  33.463836
1  91.412263  59.797819  27.912198   6.996883
3  47.054750  37.376781  94.286800  53.429360
0  28.598118   8.037050  51.856085  45.859414
------
   a  b  c
0  1  0  8
1  1  1  7
2  1  2  6
3  1  3  5
4  2  4  4
5  2  5  3
6  2  6  2
7  2  7  1
   a  b  c
3  1  3  5
2  1  2  6
1  1  1  7
0  1  0  8
7  2  7  1
6  2  6  2
5  2  5  3
4  2  4  4

(2)索引排序

# 排序2 - 索引排序 .sort_index
df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                  index = [5,4,3,2],
                   columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                  index = ['h','s','x','g'],
                   columns = ['a','b','c','d'])
print(df1)
print(df1.sort_index())
print(df2)
print(df2.sort_index())
# 按照index排序
# 默认 ascending=True, inplace=False

输出结果:

           a          b          c          d
5  80.932585  71.991854  64.582943  23.443231
4  82.054030  87.459058  12.108433  83.047490
3  56.329863  14.926822  47.884418  59.880352
2   0.347007  69.794103  74.375345  12.736429
           a          b          c          d
2   0.347007  69.794103  74.375345  12.736429
3  56.329863  14.926822  47.884418  59.880352
4  82.054030  87.459058  12.108433  83.047490
5  80.932585  71.991854  64.582943  23.443231
           a          b          c          d
h  53.041921  93.834097  13.423132  82.702020
s   0.003814  75.721426  73.086606  20.597472
x  32.678307  58.369155  70.487505  24.833117
g  46.232889  19.365147   9.872537  98.246438
           a          b          c          d
g  46.232889  19.365147   9.872537  98.246438
h  53.041921  93.834097  13.423132  82.702020
s   0.003814  75.721426  73.086606  20.597472
x  32.678307  58.369155  70.487505  24.833117

(3)

df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                  index = [5,4,3,2],
                   columns = ['a','b','c','d'])
print(df1)
print(df1.sort_index())
print(df1)  # df1并没有变
print(df1.sort_index(inplace = True))
print(df1)  # df1发生改变

输出结果:

           a          b          c          d
5  45.004735  23.449962  52.756124  60.237141
4  74.945903  63.813663  29.937821  66.420415
3  45.737208  82.376775  80.615108  40.479094
2  41.743173  82.013411  83.372130  76.195150
           a          b          c          d
2  41.743173  82.013411  83.372130  76.195150
3  45.737208  82.376775  80.615108  40.479094
4  74.945903  63.813663  29.937821  66.420415
5  45.004735  23.449962  52.756124  60.237141
           a          b          c          d
5  45.004735  23.449962  52.756124  60.237141
4  74.945903  63.813663  29.937821  66.420415
3  45.737208  82.376775  80.615108  40.479094
2  41.743173  82.013411  83.372130  76.195150
None
           a          b          c          d
2  41.743173  82.013411  83.372130  76.195150
3  45.737208  82.376775  80.615108  40.479094
4  74.945903  63.813663  29.937821  66.420415
5  45.004735  23.449962  52.756124  60.237141

练习:

作业1:创建一个3*3,值在0-100区间随机值的Dataframe(如图),分别按照index和第二列值大小,降序排序

import numpy as np
import pandas as pd

#练习1
# df = pd.DataFrame(np.random.rand(9).reshape(3,3)*100,
#                   index=['a','b','c'],
#                   columns=['v1','v2','v3'])
# print(df)
#
# print(df.sort_index(ascending = False))
# df.sort_values(by = 'v2',ascending= False,inplace = True)
# print(df)

作业2:创建一个5*2,值在0-100区间随机值的Dataframe(如图)df1,通过修改得到df2

#练习2
# df1 = pd.DataFrame(np.random.rand(10).reshape(5,2)*100,
#                   index=['a','b','c','d','e'],
#                   columns=['v1','v2'])
# print(df1)
# print(df1.drop(['e'],axis = 0).T)

作业3:如图创建Series,并按照要求修改得到结果

#练习3
df2 = pd.Series(np.arange(10),index= ['a','b','c','d','e','f','g','h','i','j'])
print(df2)
df2.loc[['a','e','f']] = 100
print(df2)
#或者
# df2.iloc[0] = 100
# df2.iloc[4] = 100
# df2.iloc[5] = 100

转载于:https://www.cnblogs.com/carlber/p/9918208.html

你可能感兴趣的文章
JAVA链表简单实现
查看>>
[转载]T-SQL(MSSQL)语句查询执行顺序
查看>>
SignalR 行实时通信最大连接数
查看>>
开发进度6
查看>>
php方法重载
查看>>
三次握手和四次挥手(二)
查看>>
MySQL中的索引
查看>>
Android开发之手势滑动(滑动手势监听)详解
查看>>
switch
查看>>
HTTP错误code大全
查看>>
PAT Advanced Level 1043
查看>>
C++重载运算符练习--对people类重载“= =”运算符和“=”运算符
查看>>
Nmap命令的实用范例
查看>>
7-1 查找整数编程总结
查看>>
安装PHP以及搭建博客(一)
查看>>
关于WORD文档的读取乱码问题
查看>>
[问题记录.dotnet]取网卡信息报错"找不到"-WMI - Not found
查看>>
Codeforces Round #254 (Div. 2):B. DZY Loves Chemistry
查看>>
linux 安装虚拟机
查看>>
Thinkphp5笔记二:创建模块
查看>>