Options
Couleur de fond
Couleur de fond
Algorithmes et traitement de la donnée brute
"""K-means customer segmentation script.

Created on Sept 18 2016

@author: sebastien.parmentier@gmail.com

Usage from a command window::

    python Segmentation.py -p parameters.csv

The parameter file is a ';'-separated CSV with the columns "Param name" and
"Param value". It must provide ``path_customers`` (input customer file),
``path_clusters_results`` and ``path_clusters_summary`` (output files) and
``nb_clusters`` (number of clusters to compute).

The customer file is a ';'-separated CSV that holds at least the columns
``customer_id``, ``clv``, ``csat`` and ``c2s``; a ``behavior`` column is
optional and, when present, adds per-behavior customer counts to the summary.
"""

import sys
from optparse import OptionParser

import numpy as np
import pandas as pd
from scipy import stats  # noqa: F401 - not used below, kept from the original import list

# KPI columns in the order they are fed to k-means and written to the results file.
CLUSTERING_AXES = ("clv", "csat", "c2s")
# KPI columns in the order they appear in the summary file.
SUMMARY_KPIS = ("c2s", "csat", "clv")


def force_utf8_default_encoding():
    """Make UTF-8 the default string encoding on Python 2; do nothing on Python 3."""
    if sys.version_info[0] >= 3:
        # ``reload`` and ``sys.setdefaultencoding`` only exist on Python 2.
        return
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding("utf-8")


def read_table(path):
    """Read a ';'-separated CSV file into a DataFrame.

    Rows are split on "\\n" only, so a file saved with Windows "\\r\\n" line
    endings keeps a trailing "\\r" on the last field of every row (header
    included); callers strip it. ``low_memory=False`` prevents the warning
    about mixed column types. Malformed rows are skipped with a warning.
    """
    options = dict(low_memory=False, sep=";", lineterminator="\n")
    try:
        # Spelling understood by pandas >= 1.3 (the old keyword is gone in 2.0).
        return pd.read_csv(path, on_bad_lines="warn", **options)
    except TypeError:
        # Older pandas rejects ``on_bad_lines``; use the original keyword.
        return pd.read_csv(path, error_bad_lines=False, **options)


def load_parameters(path_parameters):
    """Load the run parameters from the parameter file.

    Returns a dict with the keys ``path_customers``, ``path_clusters_results``,
    ``path_clusters_summary`` (strings) and ``nb_clusters`` (int).

    Raises ValueError when one of these parameters is missing from the file.
    """
    df_params = read_table(path_parameters)
    # Same "\r" clean-up as for the customer file, so that a parameter file
    # with Windows line endings is accepted too.
    df_params.rename(columns=lambda name: name.strip("\r"), inplace=True)

    def lookup(name):
        matches = df_params.loc[df_params["Param name"] == name, "Param value"].values
        if len(matches) == 0:
            raise ValueError(
                "parameter '{}' is missing from {}".format(name, path_parameters)
            )
        value = matches[0]  # the first occurrence wins
        if isinstance(value, str):
            value = value.strip("\r")
        return value

    return {
        "path_customers": lookup("path_customers"),
        "path_clusters_results": lookup("path_clusters_results"),
        "path_clusters_summary": lookup("path_clusters_summary"),
        "nb_clusters": int(lookup("nb_clusters")),
    }


def parse_decimal(value):
    """Convert *value* to float, accepting ',' as the decimal separator."""
    return float(str(value).replace(",", ".").strip("\r"))


def clean_customers(df_customers):
    """Clean the customer table in place and return it.

    Strips "\\r" from every column name, then converts the KPI columns
    (clv, csat, c2s) to float. The KPI test is made on the *cleaned* column
    name, so a KPI that happens to be the last column of a "\\r\\n" file
    (read as e.g. "clv\\r") is converted as well.
    """
    df_customers.rename(columns=lambda name: name.strip("\r"), inplace=True)
    for kpi in CLUSTERING_AXES:
        if kpi in df_customers.columns:
            df_customers[kpi] = df_customers[kpi].apply(parse_decimal)
    return df_customers


def assign_clusters(df_customers, nb_cluster):
    """Run k-means on the KPI columns and attach a cluster to each customer.

    Returns ``(clusters, df_kmeans)``: ``clusters`` is the array of 0-based
    labels returned by k-means, ``df_kmeans`` is the table written to the
    results file (KPIs, 1-based ``cluster``, ``customer_id`` and, when the
    input has it, ``behavior``).
    """
    # Imported here, right before use, so the rest of the module does not
    # require scikit-learn to be importable.
    from sklearn.cluster import KMeans

    # Explicit copy: columns are added below and must not touch df_customers.
    df_kmeans = df_customers[list(CLUSTERING_AXES)].copy()
    clusters = KMeans(n_clusters=nb_cluster).fit_predict(df_kmeans)
    # The published cluster number runs from 1 (not 0) to nb_cluster.
    df_kmeans["cluster"] = clusters + 1
    df_kmeans["customer_id"] = np.array(df_customers["customer_id"])
    if "behavior" in df_customers.columns:
        df_kmeans["behavior"] = np.array(df_customers["behavior"])
    return clusters, df_kmeans


def summarize_clusters(df_customers, clusters):
    """Build the per-cluster summary table.

    ``clusters`` holds one 0-based label per row of ``df_customers``. The
    result has one row per distinct label with the mean of c2s, csat and clv,
    then (only when ``df_customers`` has a ``behavior`` column) the number of
    customers of each distinct behavior, and finally the 1-based ``Cluster``
    number. ``df_customers`` is left unmodified.
    """
    labelled = df_customers.copy()
    labelled["clusters_"] = clusters

    if "behavior" in labelled.columns:
        behaviors = np.unique(labelled["behavior"])
    else:
        behaviors = []
    row_names = list(SUMMARY_KPIS) + [str(behavior) for behavior in behaviors]

    per_cluster = {}
    for label in np.unique(clusters):
        members = labelled[labelled["clusters_"] == label]
        row = [np.mean(members[kpi]) for kpi in SUMMARY_KPIS]
        row.extend(
            members[members["behavior"] == behavior].shape[0]
            for behavior in behaviors
        )
        per_cluster[label] = row

    # One column per cluster, then swap the axes to get one row per cluster.
    df_res = pd.DataFrame(per_cluster, index=row_names).transpose()
    df_res["Cluster"] = df_res.index + 1
    return df_res


def main(argv=None):
    """Run the segmentation; *argv* defaults to the command-line arguments."""
    print("START CLUSTERING")
    force_utf8_default_encoding()

    parser = OptionParser()
    parser.add_option("-p", "--parameters", dest="path_parameters",
                      action="store", default="parameters.csv")
    options, _ = parser.parse_args(argv)

    print("LOAD PARAMETERS")
    params = load_parameters(options.path_parameters)
    nb_cluster = params["nb_clusters"]

    print("LOAD DATA")
    df_customers = read_table(params["path_customers"])

    print("LOAD DATA CLEANSING")
    clean_customers(df_customers)

    print("NORMALIZE IF NEEDED")
    # Normalization is currently disabled: the KPIs are clustered on their raw
    # scale. The disabled implementation divided each KPI column by its
    # Euclidean norm (np.linalg.norm) and left all-zero columns unchanged.

    print("COMPUTE KMEANS ANALYSIS WITH {} CLUSTERS".format(nb_cluster))
    print("PREDICT THE CLOSEST CLUSTER FOR {} CUSTOMERS".format(
        len(np.unique(df_customers["customer_id"]))))
    clusters, df_kmeans = assign_clusters(df_customers, nb_cluster)
    df_kmeans.to_csv(params["path_clusters_results"], sep=";", index=False)

    print("STATISTICS FOR EACH CLUSTER")
    df_res = summarize_clusters(df_customers, clusters)
    df_res.to_csv(params["path_clusters_summary"], sep=";", index=False)
    if "behavior" in df_customers.columns:
        print("END CLUSTERING WITH BEHAVIOR")
    else:
        print("END CLUSTERING WITHOUT BEHAVIOR")


if __name__ == "__main__":
    main()