#!/usr/bin/env python
# coding: utf-8
# Notebook export: load the train/test CSV files and inspect the
# `Categorie3` label before fitting text classifiers.

import unicodedata

import numpy as np
import pandas as pd
import pylab as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from stop_words import get_stop_words

# `get_ipython` only exists when the code runs under IPython/Jupyter; guard
# the magic so the file can also be executed as a plain script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # not running under IPython: there is no inline backend to enable

# The original set this option twice (100, then 200); only the last value
# ever took effect.
pd.set_option('display.max_columns', 200)
pd.options.display.encoding = 'ascii'

# # Load Data

# First CSV column is used as the index of each frame.
train = pd.read_csv('data/train.csv', index_col=0)
test = pd.read_csv('data/test.csv', index_col=0)

# Replace missing values with empty strings so that text columns can be
# concatenated and vectorized without NaN handling.
train.fillna('', inplace=True)
test.fillna('', inplace=True)

# The bare expressions below are notebook cell outputs: they display a value
# in Jupyter and are no-ops when the file is run as a script.
train.head()

train.shape, test.shape

# Number of distinct `Categorie3` values in each split.
train['Categorie3'].value_counts().count()

test['Categorie3'].value_counts().count()

# Stack train and test rows. `DataFrame.append` was removed in pandas 2.0;
# `pd.concat` gives the same result (original indices are kept).
df_full = pd.concat([train, test])

df_full['Categorie3'].value_counts().count()

df_full['Categorie3_Name'].value_counts().count()

# Note (translated from the original French notebook cell):
# The multinomial naive Bayes classifier is suited to classification with
# discrete features (word counts for text classification). The multinomial
# distribution normally requires integer counts. In practice, however,
# fractional counts such as tf-idf can also be used.
# Multinomial naive Bayes classifier; the same instance is refit from scratch
# on every feature representation tried below.
model = MultinomialNB()

# Linear classifier trained with SGD and hinge loss. `n_iter` was removed from
# scikit-learn; `max_iter=400` with `tol=None` keeps the original behaviour
# of running exactly 400 epochs with no early stopping.
model2 = SGDClassifier(loss='hinge', max_iter=400, tol=None, n_jobs=-1)

# # A first bag-of-words model

vectorizer = CountVectorizer()

# Vocabulary is learned on the training titles only, then reused for test.
X_train = vectorizer.fit_transform(train.Libelle)
X_test = vectorizer.transform(test.Libelle)

# Number of stored entries in the sparse training matrix (notebook output).
len(X_train.data)

model.fit(X_train, train.Categorie3)

print('Accuracy : %.4f' % accuracy_score(test.Categorie3, model.predict(X_test)))

# # Other fields

# Concatenate description, title and brand into a single text column.
train['texte'] = train['Description'] + ' ' + train['Libelle'] + ' ' + train['Marque']
test['texte'] = test['Description'] + ' ' + test['Libelle'] + ' ' + test['Marque']

vectorizer = CountVectorizer()

X_train = vectorizer.fit_transform(train.texte)
X_test = vectorizer.transform(test.texte)

# The original notebook ran this identical fit in two consecutive cells;
# one call is enough.
model.fit(X_train, train.Categorie3)

print('Accuracy : %.4f' % accuracy_score(test.Categorie3, model.predict(X_test)))

# # TF/IDF

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(train.texte)
X_test = vectorizer.transform(test.texte)

model.fit(X_train, train.Categorie3)

print('Accuracy : %.4f' % accuracy_score(test.Categorie3, model.predict(X_test)))

# # Bigrams

# Word unigrams and bigrams with TF-IDF weighting.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

X_train = vectorizer.fit_transform(train.texte)
X_test = vectorizer.transform(test.texte)

model.fit(X_train, train.Categorie3)

print('Accuracy : %.4f' % accuracy_score(test.Categorie3, model.predict(X_test)))

# # Bag of characters (1)

# Single-character counts with the `char_wb` analyzer.
vectorizer = CountVectorizer(analyzer='char_wb')

X_train = vectorizer.fit_transform(train.texte)
X_test = vectorizer.transform(test.texte)

model.fit(X_train, train.Categorie3)

print('Accuracy : %.4f' % accuracy_score(test.Categorie3, model.predict(X_test)))

# # Bag of characters (3)
# Character 3-grams with the `char_wb` analyzer.
vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(3, 3))

X_train = vectorizer.fit_transform(train.texte)
X_test = vectorizer.transform(test.texte)

model.fit(X_train, train.Categorie3)

print('Accuracy : %.4f' % accuracy_score(test.Categorie3, model.predict(X_test)))

# # Bag of characters (5)

# Character 5-grams with the `char_wb` analyzer.
vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5))

X_train = vectorizer.fit_transform(train.texte)
X_test = vectorizer.transform(test.texte)

model.fit(X_train, train.Categorie3)

print('Accuracy : %.4f' % accuracy_score(test.Categorie3, model.predict(X_test)))

# # Bag of characters (6)

# Character 5-grams with the plain `char` analyzer instead of `char_wb`.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5))

X_train = vectorizer.fit_transform(train.texte)
X_test = vectorizer.transform(test.texte)

model.fit(X_train, train.Categorie3)

print('Accuracy : %.4f' % accuracy_score(test.Categorie3, model.predict(X_test)))

# # Bag of words, 5-grams (titled "Bag of characters (7)" in the notebook)

# NOTE: despite the original section title, this run uses the *word*
# analyzer, i.e. sequences of five consecutive words.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(5, 5))

X_train = vectorizer.fit_transform(train.texte)
X_test = vectorizer.transform(test.texte)

model.fit(X_train, train.Categorie3)

print('Accuracy : %.4f' % accuracy_score(test.Categorie3, model.predict(X_test)))

# # Bag of characters (8)

# Same character 5-gram features as run (6), but scored with the
# SGD classifier `model2` instead of naive Bayes.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5))

X_train = vectorizer.fit_transform(train.texte)
X_test = vectorizer.transform(test.texte)

model2.fit(X_train, train.Categorie3)

print('Accuracy : %.4f' % accuracy_score(test.Categorie3, model2.predict(X_test)))

# # TF/IDF(2)

# TF-IDF with French stop words removed and terms present in more than half
# of the documents dropped. The original passed `min_df=0`; an integer
# `min_df` is an absolute document count with a minimum of 1, and recent
# scikit-learn releases reject 0. `min_df=1` filters nothing either, since
# every vocabulary term occurs in at least one document.
vectorizer = TfidfVectorizer(min_df=1, max_df=0.5, stop_words=get_stop_words('fr'))

X_train = vectorizer.fit_transform(train.texte)
X_test = vectorizer.transform(test.texte)

model.fit(X_train, train.Categorie3)

print('Accuracy : %.4f' % accuracy_score(test.Categorie3, model.predict(X_test)))