pandas cheat sheet

import numpy as np
import pandas as pd

读取文件

1
2
pd.read_csv("d.csv",sep=',',index_col=False)
excelsource=pd.read_excel('111.xlsx', index_col=None)

月份

1
pd.date_range(start='2018/01/01', end='2018/07/01',freq='M')

导出数据

result.to_csv(“out.txt”)

显示打印行数

pd.set_option(‘display.max_rows’, 20)

精度

pd.option_context(‘display.precision’, 10)

打印宽度

pd.set_option(‘display.width’, 300)

条件修改列数据

result[‘res’][result[‘sum’]<=0]=result[‘AMT’]
result[‘res’][result[‘sum’]>0]=result[‘AMT’]-result[‘sum’]

union

pd.concat([df1,df2])

join

pd.merge(result,thismonthdf,on=”ACCOUNT_ID”,how=’left’)

pd.merge(excelsource,dbsource,left_on=”passId”,right_on=’pass_id’,how=’left’)

groupby

dftemp.groupby(dftemp[‘LOAN_ID’]).agg({“PRODUCT_ID”:’first’,”AMT”:np.sum}).reset_index()

如果reset_index时,column有同名列,就指定drop=True

aaa.reset_index(drop=True)

sort and groupby

data.sort_values(by=’occurtime’, ascending=False).groupby([‘bid’]).first()
data.sort_values(by=’occurtime’, ascending=False).groupby([‘bid’]).head(1)

对比数据差异

fcore[‘test’]=0
result = pd.merge(asset,fcore,on=[‘LOAN_ID’,’REQUEST_ID’],how=’left’)
result[result[‘test’]!=0]

1
result = pd.merge(ALL,BLACK,on=['key'],how='outer')

去重

data21.drop_duplicates()

差集

subset为去重的列,keep为false表示重复的直接删掉而不是distinct

1
WHITE=huaihai.append(BLACK).append(BLACK).append(GREY).append(GREY).drop_duplicates(subset=['key'],keep=False)

文件输出

pd.DataFrame(result[result[‘res’]>0], columns=[‘CUSTOMER_ID’,’ACCOUNT_ID’,’FINANCING_SOURCE’,’res’]).to_csv(“./credit/out_”+stopDate.strftime(“%Y-%m-%d”)+”.tsv”,sep=’\t’)

filter

df2[df2[‘E’].isin([‘two’,’four’])]
bacdata[(bacdata[‘CHARGES’]<0) | (bacdata[‘INTEREST’]<0)]
b[(b[‘complete_day’]>=’2019-02-27’) & (b[‘complete_day’]<’2019-02-28’) & (b[‘timepoint’].astype(int)<2)]

筛选 null 数据

temp[temp[‘1’].isnull()]
temp[temp[‘1’].notnull()]

check if exists

1094961539513730115 in fcoreData[‘LOAN_ID’].values
fcoreData[‘LOAN_ID’].values.tolist()

遍历每一行

def _map(data, exp):
for index, row in data.iterrows(): # 获取每行的index、row
for col_name in data.columns:
row[col_name] = exp(row[col_name]) # 把结果返回给data
return data

#遍历修改
for i, trial in dfTrials.iterrows():
dfTrials.loc[i, “response”] = “answer {}”.format(trial[“no”])

https://blog.csdn.net/ls13552912394/article/details/79349809

修改某一列的值
b.iloc[2][‘complete_day’]=’2019-02-27’

多列加和,axis表示列
b.apply(lambda x: int(x[‘timepoint’]) + int(x[‘branch_type’]), axis=1)

#时间相关
today = datetime.strptime(datetime.today().strftime(“%Y-%m-%d”), ‘%Y-%m-%d’)
datetime.combine(row[‘CLEAR_DATE’], datetime.min.time())
datetime(2016,9,3).date()
raw_data[‘Mycol’] = pd.to_datetime(raw_data[‘Mycol’], format=’%Y-%m-%d %H:%M:%S.%f’)

math.isnan()

画图

%matplotlib inline
result.plot()

类型转换

df[‘col2’] = df[‘col2’].astype(‘int’)
print ‘———–’
print df.dtypes
df[‘col2’] = df[‘col2’].astype(‘float64’)
print ‘———–’
print df.dtypes

精度问题

555.55*100=55554.999999999
所以得 round 一下
details[‘PRINCIPAL’]=details[‘PRINCIPAL’].round()

df.index.name = ‘foo’