本文给大家带来了 Python 中 scikit-learn 机器学习代码的具体实例,以下是全部代码内容:
# -*- coding: utf-8 -*-
import numpy
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn import cross_validation
from sklearn import preprocessing
#import iris_data
def load_data():
    """Load the iris dataset and split it into training and test parts.

    Returns:
        A 4-tuple ``(x_train, y_train, x_test, y_test)``. Note the order:
        both training arrays come first, which differs from the order in
        which ``train_test_split`` itself returns them.
    """
    dataset = load_iris()
    # 20% of the samples are held out; the fixed seed keeps the split repeatable.
    features_train, features_test, labels_train, labels_test = train_test_split(
        dataset.data, dataset.target, test_size=0.20, random_state=42)
    return features_train, labels_train, features_test, labels_test
def train_clf3(train_data, train_tags):
    """Fit a linear support-vector classifier on the training set.

    Args:
        train_data: Feature matrix handed directly to ``LinearSVC.fit``.
        train_tags: Target label for each row of ``train_data``.

    Returns:
        The fitted ``LinearSVC`` instance.
    """
    # LinearSVC is a linear model (it has no RBF kernel option).
    svm = LinearSVC(C=1100.0)
    svm.fit(train_data, train_tags)
    return svm
def train_clf(train_data, train_tags):
    """Fit a multinomial naive Bayes classifier, echoing the labels first.

    Args:
        train_data: Feature matrix handed to ``MultinomialNB.fit``.
        train_tags: Sequence of target labels; converted to a numpy array.

    Returns:
        The fitted ``MultinomialNB`` instance (smoothing ``alpha=0.01``).
    """
    labels = numpy.asarray(train_tags)
    # Debug output: show the label array that is about to be used for fitting.
    print(labels)
    model = MultinomialNB(alpha=0.01)
    model.fit(train_data, labels)
    return model
def evaluate(actual, pred, average='weighted'):
    """Print precision, recall and F1 for a set of predictions.

    Args:
        actual: Ground-truth labels.
        pred: Predicted labels, aligned element-wise with ``actual``.
        average: Averaging strategy forwarded to the scikit-learn metric
            functions. It must be one that yields a single number (for
            example ``'weighted'``, ``'macro'``, ``'micro'`` or
            ``'binary'``), because the results are formatted as floats.

    Returns:
        None. The three scores are written to standard output.
    """
    # The averaging mode is passed explicitly: the library default
    # ('binary') is rejected for multiclass targets such as the three iris
    # classes this script evaluates, so relying on it makes the call fail.
    m_precision = metrics.precision_score(actual, pred, average=average)
    m_recall = metrics.recall_score(actual, pred, average=average)
    m_f1 = metrics.f1_score(actual, pred, average=average)
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(m_f1))
# Script driver: train naive Bayes on the iris split, predict the held-out
# rows, then print the summary scores followed by the per-class report.
x_train, y_train, x_test, y_test = load_data()
clf = train_clf(x_train, y_train)
pred = clf.predict(x_test)
evaluate(numpy.asarray(y_test), pred)
print(metrics.classification_report(y_test, pred))
下面是使用自定义数据的示例:
# coding: utf-8
import numpy
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import codecs
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn import linear_model
# Small in-memory sample corpus: each entry is one document whose tokens are
# already separated by spaces.
# NOTE(review): not referenced by any function visible in this file --
# confirm whether it is still needed.
train_corpus = [
'我们 我们 好孩子 认证 。 就是',
'我们 好孩子 认证 。 中国',
'我们 好孩子 认证 。 孤独',
'我们 好孩子 认证 。',
]
# Companion sample corpus in the same space-separated format.
# NOTE(review): also unused in the visible code.
test_corpus = [
'我 菲律宾 韩国',
'我们 好孩子 认证 。 中国',
]
def _read_tagged_lines(path):
    """Parse one ``tag:payload`` file into parallel word and tag lists."""
    words = []
    tags = []
    # The context manager closes the handle even when a malformed line raises.
    with codecs.open(path, 'r', 'utf-8', 'ignore') as handle:
        for line in handle:
            # Split on the first colon only, so the payload may contain ':'.
            tks = line.split(':', 1)
            word_list = tks[1]
            # Drop the first character and the last three before splitting on
            # ", ".
            # NOTE(review): this presumably strips a leading '[' and a
            # trailing ']\r\n'; a file with bare '\n' endings or no final
            # newline would lose real characters -- confirm the input format.
            word_array = word_list[1:(len(word_list) - 3)].split(", ")
            words.append(" ".join(word_array))
            tags.append(tks[0])
    return words, tags


def input_data(train_file, test_file):
    """Read the training and test files into word strings and tags.

    Every line is expected to contain a tag, a colon, and a payload; the
    payload is trimmed (first character and last three removed), split on
    ", " and re-joined with single spaces.

    Args:
        train_file: Path of the UTF-8 training file.
        test_file: Path of the UTF-8 test file.

    Returns:
        A 4-tuple ``(train_words, train_tags, test_words, test_tags)`` of
        lists; ``*_words`` holds one space-joined string per line and
        ``*_tags`` the text before the first colon of that line.

    Raises:
        IndexError: If a line contains no ':' separator.
    """
    train_words, train_tags = _read_tagged_lines(train_file)
    test_words, test_tags = _read_tagged_lines(test_file)
    return train_words, train_tags, test_words, test_tags
def vectorize(train_words, test_words):
    """Convert both corpora into hashed term-feature matrices.

    Args:
        train_words: Iterable of training documents (strings).
        test_words: Iterable of test documents (strings).

    Returns:
        A ``(train_data, test_data)`` pair of feature matrices.
    """
    # Alternatives tried by the original author: a HashingVectorizer with
    # n_features=25000, and CountVectorizer(min_df=1).
    hasher = HashingVectorizer(non_negative=True)
    # HashingVectorizer is documented as stateless, so calling fit_transform
    # separately on each corpus still maps tokens to the same columns.
    return hasher.fit_transform(train_words), hasher.fit_transform(test_words)
def vectorize1(train_words, test_words):
    """Build TF-IDF feature matrices for the training and test corpora.

    The vocabulary and the IDF weights are learned from the training corpus
    only and then applied unchanged to the test corpus, so both matrices
    share the same columns and the same weighting.

    Args:
        train_words: Iterable of training documents (strings).
        test_words: Iterable of test documents (strings).

    Returns:
        A ``(train_data, test_data)`` pair of TF-IDF matrices.
    """
    tv = TfidfVectorizer(sublinear_tf=False, use_idf=True)
    train_data = tv.fit_transform(train_words)
    # Reuse the fitted vectorizer instead of fitting a second one on the test
    # corpus: re-fitting would derive IDF weights from the test documents,
    # giving test rows a different scale than the rows the model trained on.
    test_data = tv.transform(test_words)
    return train_data, test_data
def vectorize2(train_words, test_words):
    """Build TF-IDF matrices via an explicit count step and a TF-IDF step.

    Terms are counted with English stop words removed and with terms that
    occur in more than half of the training documents dropped
    (``max_df=0.5``). Vocabulary and IDF weights are learned from the
    training corpus only and reused for the test corpus.

    Args:
        train_words: Iterable of training documents (strings).
        test_words: Iterable of test documents (strings).

    Returns:
        A ``(train_data, test_data)`` pair of TF-IDF matrices.
    """
    count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
    counts_train = count_v1.fit_transform(train_words)
    # The fitted vectorizer already carries the training vocabulary, so the
    # test corpus is projected onto the same columns with transform().
    counts_test = count_v1.transform(test_words)
    tfidftransformer = TfidfTransformer()
    train_data = tfidftransformer.fit_transform(counts_train)
    # Do not re-fit on the test counts: that would replace the training IDF
    # weights with ones estimated from the test set.
    test_data = tfidftransformer.transform(counts_test)
    return train_data, test_data
def evaluate(actual, pred, average='weighted'):
    """Print precision, recall and F1 for a set of predictions.

    Args:
        actual: Ground-truth labels.
        pred: Predicted labels, aligned element-wise with ``actual``.
        average: Averaging strategy forwarded to the scikit-learn metric
            functions. It must be one that yields a single number (for
            example ``'weighted'``, ``'macro'``, ``'micro'`` or
            ``'binary'``), because the results are formatted as floats.

    Returns:
        None. The three scores are written to standard output.
    """
    # The averaging mode is passed explicitly: the library default
    # ('binary') only suits two-class targets whose positive label is 1, and
    # is rejected for multiclass or differently-labelled data.
    m_precision = metrics.precision_score(actual, pred, average=average)
    m_recall = metrics.recall_score(actual, pred, average=average)
    m_f1 = metrics.f1_score(actual, pred, average=average)
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(m_f1))
def train_clf(train_data, train_tags):
    """Fit a multinomial naive Bayes classifier.

    Args:
        train_data: Feature matrix handed to ``MultinomialNB.fit``.
        train_tags: Sequence of target labels; converted to a numpy array.

    Returns:
        The fitted ``MultinomialNB`` instance (smoothing ``alpha=0.01``).
    """
    labels = numpy.asarray(train_tags)
    # fit() returns the estimator itself, so the fitted model can be
    # returned directly.
    return MultinomialNB(alpha=0.01).fit(train_data, labels)
def train_clf1(train_data, train_tags):
    """Fit a k-nearest-neighbours classifier with the library defaults.

    Args:
        train_data: Feature matrix handed to ``KNeighborsClassifier.fit``.
        train_tags: Sequence of target labels; converted to a numpy array.

    Returns:
        The fitted ``KNeighborsClassifier`` instance.
    """
    # No arguments are given, so the library's default neighbour count
    # applies (k=5 according to the original author's note).
    knn = KNeighborsClassifier()
    label_array = numpy.asarray(train_tags)
    knn.fit(train_data, label_array)
    return knn
def train_clf2(train_data, train_tags):
    """Fit a logistic-regression classifier with ``C=1e5``.

    Args:
        train_data: Feature matrix handed to ``LogisticRegression.fit``.
        train_tags: Target label for each row of ``train_data``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(train_data, train_tags)
    return logreg
def train_clf3(train_data, train_tags):
    """Fit a linear support-vector classifier on the training set.

    Args:
        train_data: Feature matrix handed directly to ``LinearSVC.fit``.
        train_tags: Target label for each row of ``train_data``.

    Returns:
        The fitted ``LinearSVC`` instance.
    """
    # LinearSVC is a linear model (it has no RBF kernel option).
    svm = LinearSVC(C=1100.0)
    svm.fit(train_data, train_tags)
    return svm
def train_clf4(train_data, train_tags):
    """Fit a random forest of 10 trees on a densified feature matrix.

    The input is converted with ``todense()`` before fitting; the original
    author's note states that the random forest could not be given a sparse
    matrix.

    Args:
        train_data: Sparse feature matrix exposing ``todense()``.
        train_tags: Target label for each row of ``train_data``.

    Returns:
        The fitted ``RandomForestClassifier`` instance.
    """
    dense_features = train_data.todense()
    forest = RandomForestClassifier(n_estimators=10)
    forest.fit(dense_features, train_tags)
    return forest
# Read a UTF-8 text file line by line using codecs.
def codecs_read_label_line(filename):
    """Return the lines of a UTF-8 file without their line terminators.

    Undecodable bytes are skipped (``errors='ignore'``). Only the line
    terminator is removed from each line, so a final line that lacks a
    trailing newline keeps all of its characters and CRLF-terminated lines
    do not retain a stray carriage return.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line, in file order.
    """
    # The context manager closes the file even if decoding or reading fails.
    with codecs.open(filename, 'r', 'utf-8', 'ignore') as f:
        # Each item is a single non-empty line, so splitlines() yields
        # exactly one element: the line content minus its terminator.
        return [line.splitlines()[0] for line in f]
def save_test_features(test_url, test_label):
    """Write feature/label pairs to ``test_labeded.dat``, one pair per line.

    Each output line is ``<feature>\\t<label>``. Pairs are formed with
    ``zip``, so output stops at the shorter of the two inputs.

    Args:
        test_url: Iterable of feature strings (one per test sample). When
            ``None``, the features are read from ``test.dat`` in the current
            directory instead.
        test_label: Iterable of label strings aligned with ``test_url``.

    Returns:
        None. The result is written to ``test_labeded.dat`` in the current
        working directory.
    """
    # Use the features the caller supplies rather than silently ignoring the
    # argument and re-reading 'test.dat'; the file remains the fallback.
    if test_url is None:
        features = codecs_read_label_line('test.dat')
    else:
        features = test_url
    # The context manager closes the output file even if a write fails.
    with open('test_labeded.dat', "w+") as fw:
        for url, label in zip(features, test_label):
            fw.write(url + '\t' + label)
            fw.write('\n')
def main():
    """Run the text-classification experiment end to end.

    Reads the tagged training and test files from fixed relative paths,
    vectorizes them with ``vectorize1``, trains the ``train_clf3`` model,
    prints 5-fold cross-validated weighted-F1 scores on the training data,
    then predicts the test set, prints every misclassified pair together
    with the number of errors, and finally prints the summary metrics.
    """
    train_file = u'..\\file\\py_train.txt'
    test_file = u'..\\file\\py_test.txt'
    train_words, train_tags, test_words, test_tags = input_data(train_file, test_file)
    #print len(train_words), len(train_tags), len(test_words), len(test_words),
    train_data, test_data = vectorize1(train_words, test_words)

    # Debug output describing the vectorized data.
    print(type(train_data))
    print(train_data.shape)
    print(test_data.shape)
    first_row = test_data[0]
    print(first_row.shape)
    print(numpy.asarray(first_row))

    clf = train_clf3(train_data, train_tags)
    scores = cross_validation.cross_val_score(
        clf, train_data, train_tags, cv=5, scoring="f1_weighted")
    print(scores)
    #predicted = cross_validation.cross_val_predict(clf, train_data,train_tags, cv=5)

    pred = clf.predict(test_data)
    error_list = []
    for true_tag, predict_tag in zip(test_tags, pred):
        if true_tag == predict_tag:
            continue
        # Report each mismatch as "<true> <predicted>" and remember it.
        mismatch = true_tag + ' ' + predict_tag
        print(mismatch)
        error_list.append(mismatch)
    print(len(error_list))
    evaluate(numpy.asarray(test_tags), pred)

    # Disabled step: write the predicted labels next to the test features.
    # test_feature_list = codecs_read_label_line('test.dat')
    # save_test_features(test_feature_list, pred)


if __name__ == '__main__':
    main()
免责声明:本站文章均来自网站采集或用户投稿,网站不提供任何软件下载或自行开发的软件!
如有用户或公司发现本站内容信息存在侵权行为,请邮件告知! 858582#qq.com
暂无“python中scikit-learn机器代码实例”评论...
RTX 5090要首发 性能要翻倍!三星展示GDDR7显存
三星在GTC上展示了专为下一代游戏GPU设计的GDDR7内存。
首次推出的GDDR7内存模块密度为16GB,每个模块容量为2GB。其速度预设为32 Gbps(PAM3),但也可以降至28 Gbps,以提高产量和初始阶段的整体性能和成本效益。
据三星表示,GDDR7内存的能效将提高20%,同时工作电压仅为1.1V,低于标准的1.2V。通过采用更新的封装材料和优化的电路设计,使得在高速运行时的发热量降低,GDDR7的热阻比GDDR6降低了70%。
更新动态
2025年11月08日
2025年11月08日
- 小骆驼-《草原狼2(蓝光CD)》[原抓WAV+CUE]
- 群星《欢迎来到我身边 电影原声专辑》[320K/MP3][105.02MB]
- 群星《欢迎来到我身边 电影原声专辑》[FLAC/分轨][480.9MB]
- 雷婷《梦里蓝天HQⅡ》 2023头版限量编号低速原抓[WAV+CUE][463M]
- 群星《2024好听新歌42》AI调整音效【WAV分轨】
- 王思雨-《思念陪着鸿雁飞》WAV
- 王思雨《喜马拉雅HQ》头版限量编号[WAV+CUE]
- 李健《无时无刻》[WAV+CUE][590M]
- 陈奕迅《酝酿》[WAV分轨][502M]
- 卓依婷《化蝶》2CD[WAV+CUE][1.1G]
- 群星《吉他王(黑胶CD)》[WAV+CUE]
- 齐秦《穿乐(穿越)》[WAV+CUE]
- 发烧珍品《数位CD音响测试-动向效果(九)》【WAV+CUE】
- 邝美云《邝美云精装歌集》[DSF][1.6G]
- 吕方《爱一回伤一回》[WAV+CUE][454M]