1414
1515
1616class URLData :
17-
1817 def __init__ (self , url = None , pv = None , ratio = None , peak = None ):
1918 self .url = url
2019 self .pv = pv
@@ -30,23 +29,23 @@ def parse_log_format():
3029 log_format_list = config .log_format .split ()
3130 for item in log_format_list :
3231 if item == 'ip' :
33- log_format_index .setdefault ('ip_index' , log_format_list .index (item )+ 1 )
32+ log_format_index .setdefault ('ip_index' , log_format_list .index (item ) + 1 )
3433 if item == 'real_ip' :
35- log_format_index .setdefault ('real_ip_index' , log_format_list .index (item )+ 1 )
34+ log_format_index .setdefault ('real_ip_index' , log_format_list .index (item ) + 1 )
3635 if item == 'datetime' :
37- log_format_index .setdefault ('time_index' , log_format_list .index (item )+ 1 )
36+ log_format_index .setdefault ('time_index' , log_format_list .index (item ) + 1 )
3837 if item == 'url' :
39- log_format_index .setdefault ('url_index' , log_format_list .index (item )+ 1 )
38+ log_format_index .setdefault ('url_index' , log_format_list .index (item ) + 1 )
4039 if item == 'method' :
41- log_format_index .setdefault ('method_index' , log_format_list .index (item )+ 1 )
40+ log_format_index .setdefault ('method_index' , log_format_list .index (item ) + 1 )
4241 if item == 'protocol' :
43- log_format_index .setdefault ('protocol_index' , log_format_list .index (item )+ 1 )
42+ log_format_index .setdefault ('protocol_index' , log_format_list .index (item ) + 1 )
4443 if item == 'cost' :
45- log_format_index .setdefault ('cost_time_index' , log_format_list .index (item )+ 1 )
44+ log_format_index .setdefault ('cost_time_index' , log_format_list .index (item ) + 1 )
4645 if 'real_ip_index' in log_format_index .keys ():
47- log_format_index .setdefault ('host_index' , log_format_list .index ('real_ip' )+ 1 )
46+ log_format_index .setdefault ('host_index' , log_format_list .index ('real_ip' ) + 1 )
4847 else :
49- log_format_index .setdefault ('host_index' , log_format_list .index ('ip' )+ 1 )
48+ log_format_index .setdefault ('host_index' , log_format_list .index ('ip' ) + 1 )
5049 return log_format_index
5150
5251
@@ -113,7 +112,7 @@ def parse_log_file(target_file, log_format):
113112 pattern = re .compile (config .log_pattern )
114113
115114 # First pass: read the whole file to collect each request's time, URL, method, client IP, response time, and related data
116- with open ('../data/' + target_file , 'r' ) as f :
115+ with open ('../data/' + target_file , 'r' ) as f :
117116 for line in f :
118117 match = pattern .match (line )
119118 if match is None :
@@ -138,17 +137,19 @@ def parse_log_file(target_file, log_format):
138137 method_counts ['post' ] += 1
139138 if method == 'GET' :
140139 method_counts ['get' ] += 1
141- protocol = match .group (log_format .get ('protocol_index' ))
142- urls .append (method + ' ' + url + ' ' + protocol )
140+ urls .append (method + ' ' + url )
143141 if 'cost_time_index' in log_format .keys ():
144- cost_time_list .append ({'time' : log_time , 'cost_time' : int (float (match .group (log_format .get ('cost_time_index' )))* 1000 )})
142+ if cost_time_flag :
143+ cost_time_list .append ({'time' : log_time , 'cost_time' : int (float (match .group (log_format .get ('cost_time_index' ))) * 1000 )})
144+ else :
145+ cost_time_list .append ({'time' : '' , 'cost_time' : int (float (match .group (log_format .get ('cost_time_index' ))) * 1000 )})
145146
146147 # Compute PV, UV, the average request count, and the GET/POST percentages
147148 pv = len (times )
148149 uv = len (set (hosts ))
149- response_avg = int (pv / len (set (times )))
150- method_counts ['post_percentile' ] = int (method_counts ['post' ]* 100 / pv )
151- method_counts ['get_percentile' ] = int (method_counts ['get' ]* 100 / pv )
150+ response_avg = int (pv / len (set (times )))
151+ method_counts ['post_percentile' ] = int (method_counts ['post' ] * 100 / pv )
152+ method_counts ['get_percentile' ] = int (method_counts ['get' ] * 100 / pv )
152153
153154 # Count the number of requests per hour, per minute, and per second
154155 hours_counter = Counter (hours )
@@ -167,11 +168,12 @@ def parse_log_file(target_file, log_format):
167168 # Compute each request's share of the total
168169 url_data_list = []
169170 for item in urls_most_common :
170- ratio = '%0.3f' % float (item [1 ]* 100 / float (pv ))
171- url_data_list .append (URLData (url = item [0 ], pv = item [1 ], ratio = ratio ))
171+ if item [1 ] >= config .urls_pv_threshold :
172+ ratio = '%0.3f' % float (item [1 ] * 100 / float (pv ))
173+ url_data_list .append (URLData (url = item [0 ], pv = item [1 ], ratio = ratio ))
172174
173175 # Second pass: read the file again to collect the access times and response times of specific requests
174- with open ('../data/' + target_file , 'r' ) as f :
176+ with open ('../data/' + target_file , 'r' ) as f :
175177 for line in f :
176178 match = pattern .match (line )
177179 if match is None :
@@ -181,9 +183,8 @@ def parse_log_file(target_file, log_format):
181183 url = get_new_url (match .group (log_format .get ('url_index' )))
182184 else :
183185 url = match .group (log_format .get ('url_index' )).split ('?' )[0 ]
184- protocol = match .group (log_format .get ('protocol_index' ))
185186 for url_data in url_data_list :
186- if url_data .url == method + ' ' + url + ' ' + protocol :
187+ if url_data .url == ' ' . join ([ method , url ]) :
187188 url_data .time .append (match .group (log_format .get ('time_index' )))
188189 if 'cost_time_index' in log_format .keys ():
189190 url_data .cost .append (float (match .group (log_format .get ('cost_time_index' ))))
@@ -233,27 +234,27 @@ def parse_log_file(target_file, log_format):
233234 if cost_time_list :
234235 total_cost_time_pv = float (len (cost_time_list ))
235236 if cost_time_range ['r1' ]:
236- cost_time_range_percentile ['r1p' ] = '%0.3f' % float (cost_time_range ['r1' ]* 100 / total_cost_time_pv )
237+ cost_time_range_percentile ['r1p' ] = '%0.3f' % float (cost_time_range ['r1' ] * 100 / total_cost_time_pv )
237238 if cost_time_range ['r2' ]:
238- cost_time_range_percentile ['r2p' ] = '%0.3f' % float (cost_time_range ['r2' ]* 100 / total_cost_time_pv )
239+ cost_time_range_percentile ['r2p' ] = '%0.3f' % float (cost_time_range ['r2' ] * 100 / total_cost_time_pv )
239240 if cost_time_range ['r3' ]:
240- cost_time_range_percentile ['r3p' ] = '%0.3f' % float (cost_time_range ['r3' ]* 100 / total_cost_time_pv )
241+ cost_time_range_percentile ['r3p' ] = '%0.3f' % float (cost_time_range ['r3' ] * 100 / total_cost_time_pv )
241242 if cost_time_range ['r4' ]:
242- cost_time_range_percentile ['r4p' ] = '%0.3f' % float (cost_time_range ['r4' ]* 100 / total_cost_time_pv )
243+ cost_time_range_percentile ['r4p' ] = '%0.3f' % float (cost_time_range ['r4' ] * 100 / total_cost_time_pv )
243244 if cost_time_range ['r5' ]:
244- cost_time_range_percentile ['r5p' ] = '%0.3f' % float (cost_time_range ['r5' ]* 100 / total_cost_time_pv )
245+ cost_time_range_percentile ['r5p' ] = '%0.3f' % float (cost_time_range ['r5' ] * 100 / total_cost_time_pv )
245246 if cost_time_range ['r6' ]:
246- cost_time_range_percentile ['r6p' ] = '%0.3f' % float (cost_time_range ['r6' ]* 100 / total_cost_time_pv )
247+ cost_time_range_percentile ['r6p' ] = '%0.3f' % float (cost_time_range ['r6' ] * 100 / total_cost_time_pv )
247248 if cost_time_range ['r7' ]:
248- cost_time_range_percentile ['r7p' ] = '%0.3f' % float (cost_time_range ['r7' ]* 100 / total_cost_time_pv )
249+ cost_time_range_percentile ['r7p' ] = '%0.3f' % float (cost_time_range ['r7' ] * 100 / total_cost_time_pv )
249250 if cost_time_range ['r8' ]:
250- cost_time_range_percentile ['r8p' ] = '%0.3f' % float (cost_time_range ['r8' ]* 100 / total_cost_time_pv )
251+ cost_time_range_percentile ['r8p' ] = '%0.3f' % float (cost_time_range ['r8' ] * 100 / total_cost_time_pv )
251252 if cost_time_range ['r9' ]:
252- cost_time_range_percentile ['r9p' ] = '%0.3f' % float (cost_time_range ['r9' ]* 100 / total_cost_time_pv )
253+ cost_time_range_percentile ['r9p' ] = '%0.3f' % float (cost_time_range ['r9' ] * 100 / total_cost_time_pv )
253254 if cost_time_range ['r10' ]:
254- cost_time_range_percentile ['r10p' ] = '%0.3f' % float (cost_time_range ['r10' ]* 100 / total_cost_time_pv )
255+ cost_time_range_percentile ['r10p' ] = '%0.3f' % float (cost_time_range ['r10' ] * 100 / total_cost_time_pv )
255256 if cost_time_range ['r11' ]:
256- cost_time_range_percentile ['r11p' ] = '%0.3f' % float (cost_time_range ['r11' ]* 100 / total_cost_time_pv )
257+ cost_time_range_percentile ['r11p' ] = '%0.3f' % float (cost_time_range ['r11' ] * 100 / total_cost_time_pv )
257258
258259 total_data = {'pv' : pv , 'uv' : uv , 'response_avg' : response_avg , 'response_peak' : response_peak ,
259260 'response_peak_time' : response_peak_time , 'url_data_list' : url_data_list ,
@@ -282,22 +283,22 @@ def parse_log_file_with_goaccess(target_file):
282283
283284
def main():
    """Parse every data file that has no report yet, then refresh the index page.

    Steps:
      1. Resolve the regex group indexes of the configured log format.
      2. List the names under ``../result/report/`` (with ``.html`` removed)
         and treat every file under ``../data`` whose name is not in that
         list as still to be processed.
      3. Parse each pending file (optionally also with goaccess when
         ``config.goaccess_flag`` is set), printing a timestamped
         start/end line around each one.
      4. Call ``update_index_html()`` once all pending files are handled.
    """
    log_format = parse_log_format()

    # A set gives O(1) membership tests below; the names are only ever
    # used for "is there already a report for this data file?" lookups.
    result_files = set(result_file.replace('.html', '')
                       for result_file in get_dir_files('../result/report/'))
    target_files = sorted(data_file for data_file in get_dir_files('../data')
                          if data_file not in result_files)

    for target_file in target_files:
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print function
        # ("<timestamp> <message>").
        print('%s %s' % (datetime.datetime.now(), ' Start parse file : ' + target_file))

        parse_log_file(target_file, log_format)
        if config.goaccess_flag:
            parse_log_file_with_goaccess(target_file)

        print('%s %s' % (datetime.datetime.now(), ' End parse file: ' + target_file))

    update_index_html()


if __name__ == '__main__':
    main()
0 commit comments