需求:
現(xiàn)有一個 csv文件,包含'CNUM'和'COMPANY'兩列,數(shù)據(jù)里包含空行,且有內(nèi)容重復的行數(shù)據(jù)。
要求:
1)去掉空行;
2)重復行數(shù)據(jù)只保留一行有效數(shù)據(jù);
3)修改'COMPANY'列的名稱為'Company_New‘;
4)并在其后增加六列,分別為'C_col',‘D_col',‘E_col',‘F_col',‘G_col',‘H_col'。
一,使用 Python Pandas來處理:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
import pandas as pd import numpy as np from pandas import DataFrame,Series def deal_with_data(filepath,newpath): file_obj = open (filepath) df = pd.read_csv(file_obj) # 讀取csv文件,創(chuàng)建 DataFrame df = df.reindex(columns = [ 'CNUM' , 'COMPANY' , 'C_col' , 'D_col' , 'E_col' , 'F_col' , 'G_col' , 'H_col' ],fill_value = None ) # 重新指定列索引 df.rename(columns = { 'COMPANY' : 'Company_New' }, inplace = True ) # 修改列名 df = df.dropna(axis = 0 ,how = 'all' ) # 去除 NAN 即文件中的空行 df[ 'CNUM' ] = df[ 'CNUM' ].astype( 'int32' ) # 將 CNUM 列的數(shù)據(jù)類型指定為 int32 df = df.drop_duplicates(subset = [ 'CNUM' , 'Company_New' ], keep = 'first' ) # 去除重復行 df.to_csv(newpath,index = False ,encoding = 'GBK' ) file_obj.close() if __name__ = = '__main__' : file_path = r 'C:UsersP78DesktoppythonCNUM_COMPANY.csv' file_save_path = r 'C:UsersP78DesktoppythonCNUM_COMPANY_OUTPUT.csv' deal_with_data(file_path,file_save_path) |
二,使用 VBA來處理:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
Option Base 1 Option Explicit Sub main() On Error GoTo error_handling Dim wb As Workbook Dim wb_out As Workbook Dim sht As Worksheet Dim sht_out As Worksheet Dim rng As Range Dim usedrows As Byte Dim usedrows_out As Byte Dim dict_cnum_company As Object Dim str_file_path As String Dim str_new_file_path As String 'assign values to variables: str_file_path = "C:UsersP78DesktopPythonCNUM_COMPANY.csv" str_new_file_path = "C:UsersP78DesktopPythonCNUM_COMPANY_OUTPUT.csv" Set wb = checkAndAttachWorkbook(str_file_path) Set sht = wb.Worksheets( "CNUM_COMPANY" ) Set wb_out = Workbooks.Add wb_out.SaveAs str_new_file_path, xlCSV 'create a csv file Set sht_out = wb_out.Worksheets( "CNUM_COMPANY_OUTPUT" ) Set dict_cnum_company = CreateObject( "Scripting.Dictionary" ) usedrows = WorksheetFunction. Max (getLastValidRow(sht, "A" ), getLastValidRow(sht, "B" )) 'rename the header ' COMPANY ' to ' Company_New',remove blank & duplicate lines / rows. Dim cnum_company As String cnum_company = "" For Each rng In sht. Range ( "A1" , "A" & usedrows) If VBA.Trim(rng.Offset( 0 , 1 ).Value) = "COMPANY" Then rng.Offset( 0 , 1 ).Value = "Company_New" End If cnum_company = rng.Value & "-" & rng.Offset( 0 , 1 ).Value If VBA.Trim(cnum_company) <> "-" And Not dict_cnum_company.Exists(rng.Value & "-" & rng.Offset( 0 , 1 ).Value) Then dict_cnum_company.Add rng.Value & "-" & rng.Offset( 0 , 1 ).Value, "" End If Next rng 'loop the keys of dict split the keyes by ' - ' into cnum array and company array. Dim index_dict As Byte Dim arr_cnum() Dim arr_Company() For index_dict = 0 To UBound(dict_cnum_company.keys) ReDim Preserve arr_cnum( 1 To UBound(dict_cnum_company.keys) + 1 ) ReDim Preserve arr_Company( 1 To UBound(dict_cnum_company.keys) + 1 ) arr_cnum(index_dict + 1 ) = Split(dict_cnum_company.keys()(index_dict), "-" )( 0 ) arr_Company(index_dict + 1 ) = Split(dict_cnum_company.keys()(index_dict), "-" )( 1 ) Debug. Print index_dict Next 'assigns the value of the arrays to the celles. sht_out. Range ( "A1" , "A" & UBound(arr_cnum)) = Application.WorksheetFunction.Transpose(arr_cnum) sht_out. Range ( "B1" , "B" & UBound(arr_Company)) = Application.WorksheetFunction.Transpose(arr_Company) 'add 6 columns to output csv file : Dim arr_columns() As Variant arr_columns = Array( "C_col" , "D_col" , "E_col" , "F_col" , "G_col" , "H_col" ) ' sht_out. Range ( "C1:H1" ) = arr_columns Call checkAndCloseWorkbook(str_file_path, False ) Call checkAndCloseWorkbook(str_new_file_path, True ) Exit Sub error_handling: Call checkAndCloseWorkbook(str_file_path, False ) Call checkAndCloseWorkbook(str_new_file_path, False ) End Sub ' 輔助函數(shù): 'Get last row of Column N in a Worksheet Function getLastValidRow(in_ws As Worksheet, in_col As String) getLastValidRow = in_ws.Cells(in_ws.Rows.count, in_col).End(xlUp).Row End Function Function checkAndAttachWorkbook(in_wb_path As String) As Workbook Dim wb As Workbook Dim mywb As String mywb = in_wb_path For Each wb In Workbooks If LCase(wb.FullName) = LCase(mywb) Then Set checkAndAttachWorkbook = wb Exit Function End If Next Set wb = Workbooks. Open (in_wb_path, UpdateLinks: = 0 ) Set checkAndAttachWorkbook = wb End Function Function checkAndCloseWorkbook(in_wb_path As String, in_saved As Boolean) Dim wb As Workbook Dim mywb As String mywb = in_wb_path For Each wb In Workbooks If LCase(wb.FullName) = LCase(mywb) Then wb.Close savechanges: = in_saved Exit Function End If Next End Function |
三,輸出結果:
兩種方法輸出結果相同:
四,比較總結:
Python pandas 內(nèi)置了大量處理數(shù)據(jù)的方法,我們不需要重復造輪子,用起來很方便,代碼簡潔的多。
Excel VBA 處理這個需求,使用了 數(shù)組,字典等數(shù)據(jù)結構(實際需求中,數(shù)據(jù)量往往很大,所以一些地方?jīng)]有直接使用遍歷單元格的方法),以及處理字符串,數(shù)組和字典的很多方法,對文件的操作也很復雜,一旦出錯,調試起來比python也較困難,代碼已經(jīng)盡量優(yōu)化,但還是遠比 Python要多。
到此這篇關于VBA處理數(shù)據(jù)與Python Pandas處理數(shù)據(jù)案例比較分析的文章就介紹到這了,更多相關VBA與Python Pandas處理數(shù)據(jù)內(nèi)容請搜索服務器之家以前的文章或繼續(xù)瀏覽下面的相關文章希望大家以后多多支持服務器之家!
原文鏈接:https://blog.csdn.net/qq_24937551/article/details/105338086