1.对比决策树与随机森林的性能,参考6.4节暴力破解源码,对其进行部分修改
(1)在6.4节源码的基础上修改,决策树部分复用6.4节内容
# Baseline reused from section 6.4: a decision tree with default settings,
# scored by 10-fold cross-validation in a single process (n_jobs=1).
# x and y are the feature matrix and labels built in the full listing below.
clf1 = tree.DecisionTreeClassifier()
score = model_selection.cross_val_score(clf1, x, y, n_jobs=1, cv=10)
# Report the mean of the per-fold scores.
print(np.mean(score))
(2)增加随机森林代码逻辑,如下所示:
# Random forest of 10 trees with no depth limit and a fixed seed
# (random_state=0), scored with the same 10-fold, single-process
# cross-validation as the decision tree so the two means are comparable.
clf2 = RandomForestClassifier(n_estimators=10, max_depth=None,min_samples_split=2, random_state=0)
score = model_selection.cross_val_score(clf2, x, y, n_jobs=1, cv=10)
# Report the mean of the per-fold scores.
print(np.mean(score))
需要注意的是,这里将n_jobs值均改为1
2.完整源码如下所示:
# -*- coding:utf-8 -*-
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection
import os
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import numpy as np
def load_one_flle(filename):
    """Return the first line of ``filename`` with newline characters stripped.

    Only the first line is read; anything after it in the file is ignored.
    An empty file yields an empty string.

    NOTE(review): presumably each ADFA-LD trace file holds its whole
    system-call sequence on a single line -- confirm against the dataset.

    Args:
        filename: Path of the text file to read.

    Returns:
        The first line of the file as a ``str`` without ``'\\n'`` at either end.
    """
    with open(filename) as f:
        return f.readline().strip('\n')
def load_adfa_training_files(rootdir):
    """Load every regular file directly inside ``rootdir`` as a label-0 sample.

    Sub-directories are skipped (no recursion). Each file contributes the
    string returned by ``load_one_flle`` and the label ``0``; attack traces
    are labelled ``1`` by ``load_adfa_hydra_ftp_files``.

    Args:
        rootdir: Directory containing the trace files.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of trace strings and ``y``
        is a list of zeros of the same length. Both are empty when the
        directory contains no regular files.
    """
    x = []
    y = []
    # Iterate the directory entries directly; order follows os.listdir.
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            y.append(0)
    return x, y
def dirlist(path, allfile=None):
    """Recursively collect the paths of all non-directory entries under ``path``.

    Args:
        path: Directory to walk.
        allfile: Optional list to append results to. It is mutated in place
            and also returned. When omitted, a new list is created.

    Returns:
        The list (``allfile`` itself when one was supplied) holding the
        joined path of every non-directory entry found under ``path``.
        Entries appear in os.listdir order, with a sub-directory's contents
        inserted at the position where that sub-directory is encountered.
    """
    if allfile is None:
        allfile = []
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            # Descend, sharing the same accumulator list.
            dirlist(filepath, allfile)
        else:
            allfile.append(filepath)
    return allfile
# Path prefix shared by every Hydra-FTP attack trace. It is compared against
# the paths produced by dirlist(), so it only matches when the loader is
# called with the matching "../data/ADFA-LD/Attack_Data_Master/" root.
file_prefix = r"../data/ADFA-LD/Attack_Data_Master/Hydra_FTP_"


def load_adfa_hydra_ftp_files(rootdir):
    """Load every file under ``rootdir`` whose path starts with ``file_prefix``.

    The directory tree is walked with ``dirlist``. Each matching file
    contributes the string returned by ``load_one_flle`` and the label ``1``.

    Args:
        rootdir: Root directory of the attack data.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of trace strings and ``y``
        is a list of ones of the same length.
    """
    x = []
    y = []
    for filepath in dirlist(rootdir, []):
        # Literal prefix test. The prefix contains '.' characters, which
        # re.match would treat as "any character" wildcards.
        if filepath.startswith(file_prefix):
            x.append(load_one_flle(filepath))
            y.append(1)
    return x, y
if __name__ == '__main__':
    # Build the data set: label-0 training traces plus label-1 Hydra-FTP traces.
    x1, y1 = load_adfa_training_files("../data/ADFA-LD/Training_Data_Master/")
    x2, y2 = load_adfa_hydra_ftp_files("../data/ADFA-LD/Attack_Data_Master/")
    x = x1 + x2
    y = y1 + y2

    # Bag-of-words counts over the tokens of each trace string, as a dense array.
    # NOTE(review): CountVectorizer's default token pattern only keeps tokens
    # of two or more characters, so single-character tokens in a trace are
    # not counted -- confirm this is intended.
    vectorizer = CountVectorizer(min_df=1)
    x = vectorizer.fit_transform(x).toarray()

    clf1 = tree.DecisionTreeClassifier()
    clf2 = RandomForestClassifier(n_estimators=10, max_depth=None,
                                  min_samples_split=2, random_state=0)

    # Score both models identically (10-fold CV, single process), printing
    # the per-fold scores and then their mean: decision tree first, random
    # forest second.
    for clf in (clf1, clf2):
        score = model_selection.cross_val_score(clf, x, y, n_jobs=1, cv=10)
        print(score)
        print(np.mean(score))
3.运行结果(依次输出:决策树的10折交叉验证得分及其均值,随机森林的10折交叉验证得分及其均值)
[1. 0.97029703 0.95 0.96969697 0.96969697 0.90909091
0.98989899 0.97979798 0.95959596 0.93939394]
0.9637468746874689
[1. 0.99009901 0.98 0.97979798 0.97979798 0.93939394
0.98989899 0.98989899 1. 1. ]
0.9848886888688868
本文为互联网自动采集或经作者授权后发布,本文观点不代表立场,若侵权下架请联系我们删帖处理!文章出自:https://blog.csdn.net/mooyuan/article/details/122755819