import numpy as np
import pandas as pd
读取文件
1 | pd.read_csv("d.csv",sep=',',index_col=False) |
月份
1 | pd.date_range(start='2018/01/01', end='2018/07/01',freq='M') |
导出数据
result.to_csv(“out.txt”)
显示打印行数
pd.set_option(‘display.max_rows’, 20)
精度
pd.option_context(‘display.precision’, 10)
打印宽度
pd.set_option(‘display.width’, 300)
条件修改列数据
result[‘res’][result[‘sum’]<=0]=result[‘AMT’]
result[‘res’][result[‘sum’]>0]=result[‘AMT’]-result[‘sum’]
union
pd.concat([df1,df2])
join
pd.merge(result,thismonthdf,on=”ACCOUNT_ID”,how=’left’)
pd.merge(excelsource,dbsource,left_on=”passId”,right_on=’pass_id’,how=’left’)
groupby
dftemp.groupby(dftemp[‘LOAN_ID’]).agg({“PRODUCT_ID”:’first’,”AMT”:np.sum}).reset_index()
如果reset_index时,column有同名列,就指定drop=True
aaa.reset_index(drop=True)
sort and groupby
data.sort_values(by=’occurtime’, ascending=False).groupby([‘bid’]).first()
data.sort_values(by=’occurtime’, ascending=False).groupby([‘bid’]).head(1)
对比数据差异
fcore[‘test’]=0
result = pd.merge(asset,fcore,on=[‘LOAN_ID’,’REQUEST_ID’],how=’left’)
result[result[‘test’]!=0]
1 | result = pd.merge(ALL,BLACK,on=['key'],how='outer') |
去重
data21.drop_duplicates()
差集
subset为去重的列,keep为false表示重复的直接删掉而不是distinct
1 | WHITE=huaihai.append(BLACK).append(BLACK).append(GREY).append(GREY).drop_duplicates(subset=['key'],keep=False) |
文件输出
pd.DataFrame(result[result[‘res’]>0], columns=[‘CUSTOMER_ID’,’ACCOUNT_ID’,’FINANCING_SOURCE’,’res’]).to_csv(“./credit/out_”+stopDate.strftime(“%Y-%m-%d”)+”.tsv”,sep=’\t’)
filter
df2[df2[‘E’].isin([‘two’,’four’])]
bacdata[(bacdata[‘CHARGES’]<0) | (bacdata[‘INTEREST’]<0)]
b[(b[‘complete_day’]>=’2019-02-27’) & (b[‘complete_day’]<’2019-02-28’) & (b[‘timepoint’].astype(int)<2)]
筛选 null 数据
temp[temp[‘1’].isnull()]
temp[temp[‘1’].notnull()]
check if exists
1094961539513730115 in fcoreData[‘LOAN_ID’].values
fcoreData[‘LOAN_ID’].values.tolist()
遍历每一行
def _map(data, exp):
for index, row in data.iterrows(): # 获取每行的index、row
for col_name in data.columns:
row[col_name] = exp(row[col_name]) # 把结果返回给data
return data
#遍历修改
for i, trial in dfTrials.iterrows():
dfTrials.loc[i, “response”] = “answer {}”.format(trial[“no”])
https://blog.csdn.net/ls13552912394/article/details/79349809
修改某一列的值
b.iloc[2][‘complete_day’]=’2019-02-27’
多列加和,axis表示列
b.apply(lambda x: int(x[‘timepoint’]) + int(x[‘branch_type’]), axis=1)
#时间相关
today = datetime.strptime(datetime.today().strftime(“%Y-%m-%d”), ‘%Y-%m-%d’)
datetime.combine(row[‘CLEAR_DATE’], datetime.min.time())
datetime(2016,9,3).date()
raw_data[‘Mycol’] = pd.to_datetime(raw_data[‘Mycol’], format=’%Y-%m-%d %H:%M:%S.%f’)
math.isnan()
画图
%matplotlib inline
result.plot()
类型转换
df[‘col2’] = df[‘col2’].astype(‘int’)
print ‘———–’
print df.dtypes
df[‘col2’] = df[‘col2’].astype(‘float64’)
print ‘———–’
print df.dtypes
精度问题
555.55*100=55554.999999999
所以得 round 一下
details[‘PRINCIPAL’]=details[‘PRINCIPAL’].round()
df.index.name = ‘foo’