# Sentiment classification of text reviews: TF-IDF features + cross-validated logistic regression.
import pickle
import re

import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
#print(tfidf.fit_transform(bag).toarray())
#######################################################
def preprocessor(text: str) -> str:
    """Normalize a raw review string for bag-of-words vectorization.

    Steps, in order:

    1. Remove HTML-like tags (anything matching ``<...>``).
    2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
       tag-free text (before lowercasing, so ``D``/``P`` still match).
    3. Lowercase the text and collapse every run of non-word characters
       into a single space.
    4. Append the collected emoticons, space-separated and with the ``-``
       "nose" removed, to the end of the cleaned text.

    Args:
        text: The raw document.

    Returns:
        The cleaned text followed by the emoticons found in it.
    """
    # Raw strings: the patterns contain backslash escapes (\W, \), \() that
    # are regex syntax, not Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    suffix = ' '.join(emoticons).replace('-', '')
    # Keep the first emoticon from fusing with the last word when the cleaned
    # text ends in a word character (e.g. "great :) movie").
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix
#######################################################
def tokenizer(text: str) -> list:
    """Split *text* into tokens on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        The list of whitespace-separated tokens; empty for an empty or
        whitespace-only string.
    """
    return text.split()
#######################################################
def tokenizer_stemmer(text: str) -> list:
    """Split *text* on whitespace and stem every token.

    Stemming is delegated to ``porter.stem``. ``porter`` is looked up as a
    module-level name at call time, so it must be bound before this function
    is first called.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one stemmed token per whitespace-separated word.
    """
    return [porter.stem(word) for word in text.split()]
#######################################################
# ---------------------------------------------------------------------------
# Train and evaluate a TF-IDF + logistic-regression sentiment classifier.
# ---------------------------------------------------------------------------

# Raw string: the Windows path contains backslashes that must not be
# interpreted as string escape sequences.
docs = pd.read_csv(r"E:\DR\logiciel_as\IMDB Dataset.csv")

# Only affects how NumPy arrays (the per-fold scores printed below) are shown.
np.set_printoptions(precision=2)

# tokenizer_stemmer() looks this stemmer up as a module-level name, so it has
# to be bound before the vectorizer is fitted.
porter = PorterStemmer()

tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=True,
    preprocessor=preprocessor,
    tokenizer=tokenizer_stemmer,
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)

# The 'precision', 'recall' and 'f1' scorers treat label 1 as the positive
# class, so the labels are encoded as integers: factorize(sort=True) maps the
# sorted unique labels to 0, 1, ...
# NOTE(review): assumes `sentiment` holds exactly two classes and that the
# one sorting last (e.g. 'positive' after 'negative') is the positive class
# -- confirm against the CSV.
y, sentiment_classes = pd.factorize(docs.sentiment, sort=True)

# Split the raw text first so the TF-IDF vocabulary and IDF weights are
# learned from the training rows only. With shuffle=False the rows keep
# their file order (leading part -> train, trailing ~33% -> test) and
# random_state has no effect.
reviews_train, reviews_test, y_train, y_test = train_test_split(
    docs.review, y, random_state=1, test_size=0.33, shuffle=False)
X_train = tfidf.fit_transform(reviews_train)
X_test = tfidf.transform(reviews_test)

clf = LogisticRegressionCV(
    scoring="accuracy",
    random_state=0,
    n_jobs=-1,
    verbose=3,
    max_iter=300,
)
clf.fit(X_train, y_train)

# Score the held-out set once and derive the error rate from it.
accuracy = clf.score(X_test, y_test)
print('Accuracy:', accuracy)
print('Error rate:', 1 - accuracy)

# One cross-validation run yields all three metrics; separate runs per metric
# would refit the same model on the same folds three times.
# NOTE(review): the folds reuse TF-IDF features fitted on the whole training
# split; wrapping vectorizer and classifier in a single pipeline would avoid
# that if fold-level isolation matters.
cv_scores = cross_validate(
    clf, X_train, y_train, cv=5, scoring=('precision', 'recall', 'f1'))

precision = cv_scores['test_precision']
print('Precision :', np.mean(precision), precision)
recall = cv_scores['test_recall']
print('Recall:', np.mean(recall), recall)
f1 = cv_scores['test_f1']
print('F1:', np.mean(f1), f1)
# "Partager" ("Share") -- web-page button text captured with the listing; not part of the script.