Full Code Repository is available on my GitHub
Have you ever noticed that in some video games, no matter the genre, characters tend to follow certain "archetypes" in battle? For example, you might have a beefy character who is strong but slow, a precise, high-damage character, or a speedy but weak character. To go one step further, have you ever noticed that when "types" or "elements" are involved, these archetypes are even more common? You would probably imagine a rock character as a big golem: slow and heavy, with lots of health points and high defensive stats. You might imagine a fire character with lots of explosive power but subpar health and defenses. And maybe you have seen lightning-based characters: high speed, high precision, and lots of critical hits.
This leads to my questions for today: are there any archetypes present in the Pokémon games? And if so, are they based on the Pokémon "type"? These two questions build on one another. For those unfamiliar with Pokémon, every Pokémon has a primary type and an optional secondary type, drawn from a selection of 18 different types. These range from simple, elemental types such as "fire" or "water" to more specialized types such as "fighting" or "dragon". With almost one thousand Pokémon spread across all of these types, one might wonder whether they fall into the archetype trope described earlier.
We will be jumping right into the video games for this project and using context and lingo that is easier to follow if you enjoy those games yourself. Either way, let's first take a look at the dataset and understand what the data represents.
The "Pokémon With Stats" data set is a .csv file which contains 1072 records. Each record represents a Pokémon and has 13 columns called number, name, type1, type2, total, hp, attack, defense, sp_attack, sp_defense, speed, generation, legendary. The description of these columns is as follows, as described on the data set's webpage.
Something to note is that there are currently 898 different Pokémon in existence, yet there are 1072 records. This is because some Pokémon have alternate forms that modify their battle statistics. For the purposes of this research, we will ignore alternate forms and stick to just the 898 base Pokémon.
To get started, like with any other great code-based analysis, we must import all of our packages that will be used. This project uses pandas and numpy for data structures, and seaborn/matplotlib for visualizations. For some summary statistics, the scipy.stats package is used. Finally, for our machine learning techniques, scikit-learn is used.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score
from scipy import stats
First, we will filter out the alternate-form records. Each individual Pokémon has its own unique ID (the "number" column), and any alternate form of a Pokémon shares the same number, so we can eliminate alternate-form records by simply dropping duplicates based on the "number" column (keeping the first of the duplicates always keeps the original, base Pokémon). After this simple filtering, we see the expected 898 records.
df = pd.read_csv('Pokemon.csv',keep_default_na=False)
df = df.drop_duplicates(subset=['number'])
df = df.reset_index(drop=True)
df
| | number | name | type1 | type2 | total | hp | attack | defense | sp_attack | sp_defense | speed | generation | legendary |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Bulbasaur | Grass | Poison | 318 | 45 | 49 | 49 | 65 | 65 | 45 | 1 | False |
| 1 | 2 | Ivysaur | Grass | Poison | 405 | 60 | 62 | 63 | 80 | 80 | 60 | 1 | False |
| 2 | 3 | Venusaur | Grass | Poison | 525 | 80 | 82 | 83 | 100 | 100 | 80 | 1 | False |
| 3 | 4 | Charmander | Fire | | 309 | 39 | 52 | 43 | 60 | 50 | 65 | 1 | False |
| 4 | 5 | Charmeleon | Fire | | 405 | 58 | 64 | 58 | 80 | 65 | 80 | 1 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 893 | 894 | Regieleki | Electric | | 580 | 80 | 100 | 50 | 100 | 50 | 200 | 8 | True |
| 894 | 895 | Regidrago | Dragon | | 580 | 200 | 100 | 50 | 100 | 50 | 80 | 8 | True |
| 895 | 896 | Glastrier | Ice | | 580 | 100 | 145 | 130 | 65 | 110 | 30 | 8 | True |
| 896 | 897 | Spectrier | Ghost | | 580 | 100 | 65 | 60 | 145 | 80 | 130 | 8 | True |
| 897 | 898 | Calyrex | Psychic | Grass | 500 | 100 | 80 | 80 | 80 | 80 | 80 | 8 | True |
898 rows × 13 columns
The data is given as base values of each battle statistic; however, we must consider that some Pokémon are designed to be stronger than others. Imagine a scenario where water type Pokémon tend to have low defense and high attack (assume this archetype is true, just for the sake of argument). We might also find many legendary water types whose stats are all relatively high; compared to normal, non-legendary Pokémon they would have high defense and even higher attack, yet their defense would still be proportionally low, so the archetype would still hold. The lesson is that we should not examine the raw values of the battle statistics, but rather the proportions of the battle statistics within each Pokémon.
When a Pokémon is described as following a "high attack, low defense" archetype, that means the proportion of its points in the attack statistic is large and the proportion in the defense statistic is small. The data set has a calculated attribute, total, which is the sum of the 6 battle stats, and we can use it to convert the points into proportions; for example, Bulbasaur's 45 HP out of a total of 318 becomes 45/318 ≈ 0.1415. After this transformation, we can easily see whether a Pokémon has most of its total points in a certain attribute or is relatively well balanced across the 6 attributes. Then we can examine the distribution of the data.
battle_cols = df.columns[5:11]
for col in battle_cols:
    df[col] = df[col] / df['total']
df
| | number | name | type1 | type2 | total | hp | attack | defense | sp_attack | sp_defense | speed | generation | legendary |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Bulbasaur | Grass | Poison | 318 | 0.141509 | 0.154088 | 0.154088 | 0.204403 | 0.204403 | 0.141509 | 1 | False |
| 1 | 2 | Ivysaur | Grass | Poison | 405 | 0.148148 | 0.153086 | 0.155556 | 0.197531 | 0.197531 | 0.148148 | 1 | False |
| 2 | 3 | Venusaur | Grass | Poison | 525 | 0.152381 | 0.156190 | 0.158095 | 0.190476 | 0.190476 | 0.152381 | 1 | False |
| 3 | 4 | Charmander | Fire | | 309 | 0.126214 | 0.168285 | 0.139159 | 0.194175 | 0.161812 | 0.210356 | 1 | False |
| 4 | 5 | Charmeleon | Fire | | 405 | 0.143210 | 0.158025 | 0.143210 | 0.197531 | 0.160494 | 0.197531 | 1 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 893 | 894 | Regieleki | Electric | | 580 | 0.137931 | 0.172414 | 0.086207 | 0.172414 | 0.086207 | 0.344828 | 8 | True |
| 894 | 895 | Regidrago | Dragon | | 580 | 0.344828 | 0.172414 | 0.086207 | 0.172414 | 0.086207 | 0.137931 | 8 | True |
| 895 | 896 | Glastrier | Ice | | 580 | 0.172414 | 0.250000 | 0.224138 | 0.112069 | 0.189655 | 0.051724 | 8 | True |
| 896 | 897 | Spectrier | Ghost | | 580 | 0.172414 | 0.112069 | 0.103448 | 0.250000 | 0.137931 | 0.224138 | 8 | True |
| 897 | 898 | Calyrex | Psychic | Grass | 500 | 0.200000 | 0.160000 | 0.160000 | 0.160000 | 0.160000 | 0.160000 | 8 | True |
898 rows × 13 columns
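As a quick sanity check (a one-liner sketch, not part of the original pipeline), each Pokémon's six proportions should now sum to 1:
# After dividing by 'total', the 6 battle stat proportions should sum to 1 per row
assert np.allclose(df[battle_cols].sum(axis=1), 1.0)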
There are 18 possible Pokémon types. Each Pokémon has a primary type and an optional secondary type, giving 18 possibilities for the first type and 18 for the second slot (one of the 17 remaining types, or no second type at all), for 324 (18 × 18) possible type combinations. Let's take a quick look at the distribution of the primary types.
plt.figure(figsize=(15, 10))
ax = sns.countplot(x='type1', data=df)
ax.set(xlabel='Primary Type', ylabel='Frequency', title="Distribution of Pokemon Types")
plt.show()
Within this data, we can also view how each of the 6 “battle stats” are distributed across all Pokémon by taking a look at some side-by-side boxplots.
cols = list(df.columns[5:11])
battle_stats = df[cols]
plt.figure(figsize=(15, 10))
sns.boxplot(data=battle_stats)
plt.show()
We see that there are a lot of outliers: Pokémon with unusually high or unusually low proportions in certain stats, without enough similar Pokémon to justify an archetype of their own. Since the techniques used in this analysis are sensitive to outliers, it would be advantageous to remove them. That alone is not enough reason, though; I always feel there should be a reason behind removing outliers. In the context of our original question, even if there are archetypes in the Pokémon data, there are bound to be Pokémon that "break the mould". A game designer can give a Pokémon whatever proportions they please, so the existence of a few Pokémon here and there that fit no common archetype should not deny the existence of archetypes altogether (leaving them in might cause us to incorrectly conclude that there is too much variance for the archetypes we are looking for). As such, we will remove these outlier Pokémon.
To accomplish this, we will find the interquartile range (IQR) for each of the 6 battle stats and filter out any Pokémon with at least one stat beyond 1.5 × IQR of the quartiles. That is, to be included in our analysis, a Pokémon must fall within the fences for all 6 stats (i.e., it is not an outlier in any sense).
tempdf = df.copy()  # We'll need a copy of the original data set, outliers included, for later!
for col in cols:
    q25, q75 = np.percentile(df[col], 25), np.percentile(df[col], 75)
    iqr = q75 - q25
    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    df = df[df[col] < upper]
    df = df[df[col] > lower]
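As a quick check (a small sketch whose only job is to print the count), we can see how many Pokémon survive the filtering:
# How many Pokemon remain after dropping outliers on any of the 6 stats
print(f"{len(df)} Pokemon remain after outlier removal")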
The resulting distributions show far fewer outliers after the filtering.
cols = list(df.columns[5:11])
battle_stats = df[cols]
plt.figure(figsize=(15, 10))
sns.boxplot(data=battle_stats)
plt.show()
Now we can take a quick look at some summary statistics to get an idea for what the "average" Pokémon looks like (if we don’t take into account any clusters/labels).
battle_stats.describe()
| | hp | attack | defense | sp_attack | sp_defense | speed |
|---|---|---|---|---|---|---|
| count | 779.000000 | 779.000000 | 779.000000 | 779.000000 | 779.000000 | 779.000000 |
| mean | 0.159656 | 0.182843 | 0.163788 | 0.167095 | 0.163029 | 0.163680 |
| std | 0.029690 | 0.044405 | 0.039927 | 0.044262 | 0.036166 | 0.053418 |
| min | 0.083333 | 0.075758 | 0.049180 | 0.047619 | 0.064912 | 0.034483 |
| 25% | 0.140064 | 0.148886 | 0.134237 | 0.132395 | 0.137169 | 0.124307 |
| 50% | 0.156250 | 0.180000 | 0.157182 | 0.166667 | 0.160000 | 0.161290 |
| 75% | 0.176777 | 0.216667 | 0.185567 | 0.200000 | 0.187016 | 0.202532 |
| max | 0.240000 | 0.295082 | 0.280528 | 0.300000 | 0.264151 | 0.314815 |
It seems that most Pokémon tend to have slightly higher attack while having balance in the remaining 5 stats. Let’s take a look at how the stats relate to each other.
sns.pairplot(battle_stats)
The histograms on the main diagonal suggest that each stat is somewhat normally distributed. If each stat is in fact normally distributed and we find very little correlation between the stats, then we can perform MANOVA to see whether Pokémon type has an effect on these 6 stats (which is exactly what we want to figure out!). It is hard to judge the linear dependence of some of these pairs from the scatterplots alone, so we will look at the correlation matrix and the covariance matrix to help decipher what these plots mean.
cov_mat=battle_stats.cov()
cov_mat
| | hp | attack | defense | sp_attack | sp_defense | speed |
|---|---|---|---|---|---|---|
| hp | 0.000882 | 0.000070 | 0.000025 | -0.000313 | -0.000139 | -0.000525 |
| attack | 0.000070 | 0.001972 | 0.000042 | -0.000878 | -0.000860 | -0.000339 |
| defense | 0.000025 | 0.000042 | 0.001594 | -0.000672 | 0.000159 | -0.001149 |
| sp_attack | -0.000313 | -0.000878 | -0.000672 | 0.001959 | 0.000139 | -0.000235 |
| sp_defense | -0.000139 | -0.000860 | 0.000159 | 0.000139 | 0.001308 | -0.000605 |
| speed | -0.000525 | -0.000339 | -0.001149 | -0.000235 | -0.000605 | 0.002854 |
corr_mat = battle_stats.corr()
corr_mat
| | hp | attack | defense | sp_attack | sp_defense | speed |
|---|---|---|---|---|---|---|
| hp | 1.000000 | 0.052878 | 0.021172 | -0.237989 | -0.129617 | -0.331319 |
| attack | 0.052878 | 1.000000 | 0.023764 | -0.446826 | -0.535569 | -0.142778 |
| defense | 0.021172 | 0.023764 | 1.000000 | -0.380275 | 0.109813 | -0.538535 |
| sp_attack | -0.237989 | -0.446826 | -0.380275 | 1.000000 | 0.086744 | -0.099480 |
| sp_defense | -0.129617 | -0.535569 | 0.109813 | 0.086744 | 1.000000 | -0.313125 |
| speed | -0.331319 | -0.142778 | -0.538535 | -0.099480 | -0.313125 | 1.000000 |
The covariances are all tiny in absolute terms (the proportions themselves are small numbers), but the correlation matrix reveals some fairly strong linear relationships, such as between attack and sp_attack (-0.447) or attack and sp_defense (-0.536). These relationships are hard to see visually, but with this added knowledge we can see that some of the scatterplots do in fact roughly form a diagonal line from the top left to the bottom right (a line with negative slope).
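Since some of these relationships are hard to eyeball in the scatterplots, an optional heatmap sketch of the correlation matrix makes the structure easier to scan:
# Visualize the correlation matrix; warm/cool colors encode positive/negative correlations
plt.figure(figsize=(8, 6))
sns.heatmap(corr_mat, annot=True, fmt='.2f', vmin=-1, vmax=1, cmap='coolwarm')
plt.title('Correlation Between Battle Stat Proportions')
plt.show()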
While there is clearly some linear dependence between pairs of variables, it is worth actually testing whether each of the 6 battle stats is normally distributed. To do this, we will perform the Shapiro-Wilk test for normality, which detects significant departures from a normally distributed population.
for col in cols:
    shapiro_test = stats.shapiro(battle_stats[col])
    print(shapiro_test)
ShapiroResult(statistic=0.978453516960144, pvalue=2.6408730757765397e-09)
ShapiroResult(statistic=0.9895996451377869, pvalue=2.5290804842370562e-05)
ShapiroResult(statistic=0.9668951034545898, pvalue=2.7778085821134058e-12)
ShapiroResult(statistic=0.9892361164093018, pvalue=1.7584263332537375e-05)
ShapiroResult(statistic=0.9807162880897522, pvalue=1.3024076039869215e-08)
ShapiroResult(statistic=0.9937264323234558, pvalue=0.00244623189792037)
Interpreting this test: the Shapiro-Wilk test takes normality as its null hypothesis and then looks for evidence against it. The resulting p-value is the probability of seeing data this far from normal if the population really were normally distributed, so a small p-value is evidence against normality. The largest p-value here belongs to the speed stat, at about 0.0024; the rest range from about 2.5e-05 all the way down to 2.8e-12. At any reasonable significance level, we reject normality for all six stats.
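The raw output above doesn't say which result belongs to which stat (they follow the order of cols: hp, attack, defense, sp_attack, sp_defense, speed), so here is a small optional sketch that labels each result:
# Re-run the tests with labels for readability
for col in cols:
    stat, p = stats.shapiro(battle_stats[col])
    print(f"{col}: W = {stat:.4f}, p = {p:.2e}")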
Before we go any further, there is one more thing to sort out: the idea of "hybrid" Pokémon, i.e., Pokémon with a secondary type. If we truly want to examine the effect of a single type on a Pokémon's stats, it is useful to start with the set of Pokémon that have no secondary type (we can refer to those as "pure" Pokémon). If there is a relationship between type and stats, there will be less variance to account for than if our Pokémon had secondary types (for example, a Pokémon that shouldn't have high defense might show unusually high defense because its secondary type is rock, which muddies the signal). If we can find a relationship in the pure Pokémon set, then we can move on to the set of all Pokémon with the types we found in mind.
To perform this filtering, all we must do is filter out any record that has an empty “type2” column. After filtering out non-pure Pokémon, we are left with 456 Pokémon. Let’s now do the same exploration and outlier detection as before to this “pure” Pokémon set.
pure_df=tempdf[tempdf['type2']=='']
pure_df = pure_df.reset_index(drop=True)
pure_df
| | number | name | type1 | type2 | total | hp | attack | defense | sp_attack | sp_defense | speed | generation | legendary |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | Charmander | Fire | | 309 | 0.126214 | 0.168285 | 0.139159 | 0.194175 | 0.161812 | 0.210356 | 1 | False |
| 1 | 5 | Charmeleon | Fire | | 405 | 0.143210 | 0.158025 | 0.143210 | 0.197531 | 0.160494 | 0.197531 | 1 | False |
| 2 | 7 | Squirtle | Water | | 314 | 0.140127 | 0.152866 | 0.207006 | 0.159236 | 0.203822 | 0.136943 | 1 | False |
| 3 | 8 | Wartortle | Water | | 405 | 0.145679 | 0.155556 | 0.197531 | 0.160494 | 0.197531 | 0.143210 | 1 | False |
| 4 | 9 | Blastoise | Water | | 530 | 0.149057 | 0.156604 | 0.188679 | 0.160377 | 0.198113 | 0.147170 | 1 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 451 | 891 | Kubfu | Fighting | | 385 | 0.155844 | 0.233766 | 0.155844 | 0.137662 | 0.129870 | 0.187013 | 8 | True |
| 452 | 894 | Regieleki | Electric | | 580 | 0.137931 | 0.172414 | 0.086207 | 0.172414 | 0.086207 | 0.344828 | 8 | True |
| 453 | 895 | Regidrago | Dragon | | 580 | 0.344828 | 0.172414 | 0.086207 | 0.172414 | 0.086207 | 0.137931 | 8 | True |
| 454 | 896 | Glastrier | Ice | | 580 | 0.172414 | 0.250000 | 0.224138 | 0.112069 | 0.189655 | 0.051724 | 8 | True |
| 455 | 897 | Spectrier | Ghost | | 580 | 0.172414 | 0.112069 | 0.103448 | 0.250000 | 0.137931 | 0.224138 | 8 | True |
456 rows × 13 columns
plt.figure(figsize=(15, 10))
ax = sns.countplot(x='type1', data=pure_df)
ax.set(xlabel='Primary Type', ylabel='Frequency', title="Distribution of Pure Pokemon Types")
plt.show()
cols = list(pure_df.columns[5:11])
pure_stats = pure_df[cols]
plt.figure(figsize=(15, 10))
sns.boxplot(data=pure_stats)
plt.show()
for col in cols:
    q25, q75 = np.percentile(pure_stats[col], 25), np.percentile(pure_stats[col], 75)
    iqr = q75 - q25
    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    pure_df = pure_df[pure_df[col] < upper]
    pure_df = pure_df[pure_df[col] > lower]
pure_stats = pure_df[cols]
plt.figure(figsize=(15, 10))
sns.boxplot(data=pure_stats)
plt.show()
pure_stats.describe()
| | hp | attack | defense | sp_attack | sp_defense | speed |
|---|---|---|---|---|---|---|
| count | 389.000000 | 389.000000 | 389.000000 | 389.000000 | 389.000000 | 389.000000 |
| mean | 0.159900 | 0.183619 | 0.161868 | 0.164946 | 0.163574 | 0.166093 |
| std | 0.030002 | 0.044587 | 0.036445 | 0.043023 | 0.035755 | 0.053912 |
| min | 0.083333 | 0.075758 | 0.073171 | 0.047619 | 0.083333 | 0.034483 |
| 25% | 0.140625 | 0.151976 | 0.134146 | 0.133333 | 0.137931 | 0.127389 |
| 50% | 0.157895 | 0.181416 | 0.157534 | 0.163265 | 0.160377 | 0.164706 |
| 75% | 0.176056 | 0.214433 | 0.181598 | 0.197222 | 0.186335 | 0.205882 |
| max | 0.240964 | 0.291667 | 0.269231 | 0.284314 | 0.265306 | 0.310345 |
We again see that pure Pokémon tend to have about 18% of their total points in attack, with relatively even proportions across the remaining 5 stats.
sns.pairplot(pure_stats)
Based on the histograms, the stats also have similar distributions in this filtered set.
pure_cov_mat = pure_stats.cov()
pure_cov_mat
| | hp | attack | defense | sp_attack | sp_defense | speed |
|---|---|---|---|---|---|---|
| hp | 0.000900 | 0.000070 | 0.000114 | -0.000330 | -0.000109 | -0.000644 |
| attack | 0.000070 | 0.001988 | 0.000061 | -0.000863 | -0.000859 | -0.000397 |
| defense | 0.000114 | 0.000061 | 0.001328 | -0.000533 | 0.000090 | -0.001059 |
| sp_attack | -0.000330 | -0.000863 | -0.000533 | 0.001851 | 0.000140 | -0.000265 |
| sp_defense | -0.000109 | -0.000859 | 0.000090 | 0.000140 | 0.001278 | -0.000541 |
| speed | -0.000644 | -0.000397 | -0.001059 | -0.000265 | -0.000541 | 0.002906 |
pure_corr_mat = pure_stats.corr()
pure_corr_mat
| | hp | attack | defense | sp_attack | sp_defense | speed |
|---|---|---|---|---|---|---|
| hp | 1.000000 | 0.052408 | 0.103809 | -0.255965 | -0.101523 | -0.398419 |
| attack | 0.052408 | 1.000000 | 0.037550 | -0.450034 | -0.538716 | -0.165166 |
| defense | 0.103809 | 0.037550 | 1.000000 | -0.339854 | 0.068756 | -0.539215 |
| sp_attack | -0.255965 | -0.450034 | -0.339854 | 1.000000 | 0.091296 | -0.114191 |
| sp_defense | -0.101523 | -0.538716 | 0.068756 | 0.091296 | 1.000000 | -0.280514 |
| speed | -0.398419 | -0.165166 | -0.539215 | -0.114191 | -0.280514 | 1.000000 |
While some relationships are a little bit weaker/stronger here and there, overall the covariance and correlations between the 6 stats are very similar.
for col in cols:
    shapiro_test = stats.shapiro(pure_stats[col])
    print(shapiro_test)
ShapiroResult(statistic=0.9741064310073853, pvalue=2.031649728451157e-06)
ShapiroResult(statistic=0.9903718829154968, pvalue=0.011999256908893585)
ShapiroResult(statistic=0.9700005054473877, pvalue=3.563248753835069e-07)
ShapiroResult(statistic=0.9924943447113037, pvalue=0.04792152717709541)
ShapiroResult(statistic=0.9727322459220886, pvalue=1.1170828884132789e-06)
ShapiroResult(statistic=0.9910305738449097, pvalue=0.018365293741226196)
Running the Shapiro-Wilk test again, 6 separate times, the p-values come out higher: sp_attack at about 0.048, speed at 0.018, and attack at 0.012. These are borderline rather than decisive, but the remaining stats still have extremely small p-values, so overall we still reject normality.
For our analysis, many techniques are ruled out because the distributions are not normal. Even if we could address the normality problem by resampling and appealing to the central limit theorem, the issue of linear dependence between the stats remains. This means we can't use MANOVA or other "traditional" statistical techniques effectively, so we will turn to machine learning techniques instead.
We will start by setting our X and y variables. Each individual x will be a vector of 6 observations, one for each of the 6 battle stats, and y will simply be the primary type of the Pokémon. Again assume, for the sake of argument, that these archetypes exist and are based on Pokémon type: if rock type Pokémon truly all have high defense and low offense, then given proportions with high defense and low offense, a classifier should tell me I gave it a rock-type Pokémon, and I could then check whether the Pokémon really is rock type. That is essentially how we will test the model's accuracy.
X = pure_df.iloc[:, 5:11].values
y = pure_df.iloc[:, 2].values
The model of choice today is a simple Linear Discriminant Analysis model. It will essentially attempt to "draw a line in the sand" between the different data points to classify them according to type.
Before fitting the model, we will first scale the data. The StandardScaler transforms each feature toward a mean of 0 and a standard deviation of 1, which helps the accuracy of our model and is generally good practice.
Next, we must figure out how many components should be used. One way is to first run LDA with n_components set to "None" and then store the ratio of explained variance from each component (essentially how much of the total variance that each component "explains"). Then we will add the components one by one until we can explain 95% of the total variance (a good standard number, as we would need all components to explain all 100%) and then we will say that is "good enough".
scaler = StandardScaler()
X=scaler.fit_transform(X)
lda = LDA(n_components=None)
X_lda = lda.fit_transform(X, y)
lda_var_ratios = lda.explained_variance_ratio_
# This code to calculate how much explained variance we get per component is in the public domain: https://creativecommons.org/publicdomain/zero/1.0/
def select_n_components(var_ratio, goal_var: float) -> int:
    # Set initial variance explained so far
    total_variance = 0.0
    # Set initial number of components
    n_components = 0
    # For the explained variance of each component:
    for explained_variance in var_ratio:
        # Add the explained variance to the total
        total_variance += explained_variance
        # Add one to the number of components
        n_components += 1
        # If we reach our goal level of explained variance
        if total_variance >= goal_var:
            # End the loop
            break
    # Return the number of components
    return n_components
n=select_n_components(lda_var_ratios, 0.95)
print("Optimal number of components:",n)
df
Optimal number of components: 4
number | name | type1 | type2 | total | hp | attack | defense | sp_attack | sp_defense | speed | generation | legendary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Bulbasaur | Grass | Poison | 318 | 0.141509 | 0.154088 | 0.154088 | 0.204403 | 0.204403 | 0.141509 | 1 | False |
1 | 2 | Ivysaur | Grass | Poison | 405 | 0.148148 | 0.153086 | 0.155556 | 0.197531 | 0.197531 | 0.148148 | 1 | False |
2 | 3 | Venusaur | Grass | Poison | 525 | 0.152381 | 0.156190 | 0.158095 | 0.190476 | 0.190476 | 0.152381 | 1 | False |
3 | 4 | Charmander | Fire | 309 | 0.126214 | 0.168285 | 0.139159 | 0.194175 | 0.161812 | 0.210356 | 1 | False | |
4 | 5 | Charmeleon | Fire | 405 | 0.143210 | 0.158025 | 0.143210 | 0.197531 | 0.160494 | 0.197531 | 1 | False | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
891 | 892 | Urshifu Single Strike Style | Fighting | Dark | 550 | 0.181818 | 0.236364 | 0.181818 | 0.114545 | 0.109091 | 0.176364 | 8 | True |
892 | 893 | Zarude | Dark | Grass | 600 | 0.175000 | 0.200000 | 0.175000 | 0.116667 | 0.158333 | 0.175000 | 8 | True |
895 | 896 | Glastrier | Ice | 580 | 0.172414 | 0.250000 | 0.224138 | 0.112069 | 0.189655 | 0.051724 | 8 | True | |
896 | 897 | Spectrier | Ghost | 580 | 0.172414 | 0.112069 | 0.103448 | 0.250000 | 0.137931 | 0.224138 | 8 | True | |
897 | 898 | Calyrex | Psychic | Grass | 500 | 0.200000 | 0.160000 | 0.160000 | 0.160000 | 0.160000 | 0.160000 | 8 | True |
779 rows × 13 columns
lda = LDA(n_components=n)
X_lda = lda.fit_transform(X, y)
After creating and fitting our model, we will use Repeated Stratified K-Fold Cross-Validation to assess it. This has a lot of steps, so let's break it down from its simplest form first.
K-Fold Cross-Validation is the basis of this testing: it splits the dataset into k "folds", holds one fold out as a test set, trains the model on the remaining folds, and repeats this so that each fold takes a turn as the test set, reporting back the average accuracy across the runs.
Since the data is getting split up, there is a chance some folds will be missing classes (for example, a fold with no Flying types in it). Stratified K-Fold Cross-Validation fixes this by preserving the proportion of each of the strata (our 18 types) in every single fold.
This method can still be noisy, since the result depends on which data lands in which fold. To smooth that out, Repeated K-Fold Cross-Validation, or in our case Repeated Stratified K-Fold Cross-Validation, runs the whole process however many times we choose, randomizing the folds each time.
cross_validator = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
scores = cross_val_score(lda, X, y, scoring='accuracy', cv=cross_validator, n_jobs=-1)
print(np.mean(scores))
0.28532432253362483
After this validation, the model's accuracy is below 30%. Since 1/18 is only about 5.6%, it beats a model that guesses a type uniformly at random, but it is still a very inaccurate model; a slightly stronger baseline to compare against is one that always guesses the most common type.
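Here is a quick sketch of that majority-class baseline check (all it does is compute the share of the most common primary type in the pure set):
# Accuracy of a model that always predicts the most common primary type
majority_baseline = pure_df['type1'].value_counts(normalize=True).max()
print(f"Majority-class baseline accuracy: {majority_baseline:.3f}")
With that baseline in mind, let's take a look at how the model classifies the data across its first two components.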
lda = LDA(n_components=n)
X_lda = lda.fit_transform(X, y)
color_list = ["red", "violet", "blue", "g", "c", "m", "y", "b", "bisque", "darkorange", "lime", "crimson", "lightslategrey", "saddlebrown", "seashell", "turquoise", "khaki", "darkolivegreen"]
# Map each of the 18 primary types to its own color
color_dict = {ptype: color_list[i] for i, ptype in enumerate(pure_df["type1"].unique())}
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.scatter(
    X_lda[:, 0],
    X_lda[:, 1],
    c=pure_df["type1"].map(color_dict),
)
plt.show()
As we can see, when we plot on the first two axes created by the LDA process, the model doesn’t do a great job at separating the data points.
While it seems the given labels (the Pokémon types) do not accurately indicate what Pokémon’s battle stat proportions will be, it is still possible that we need to look at it through a broader lens.
We will now try Principal Component Analysis (PCA). Where LDA explicitly uses the type labels to find projections that separate the classes, PCA ignores the labels entirely: it builds "Principal Components", linear combinations of the 6 stats that capture as much of the total variance as possible, and we then train a classifier on those components. Because we are no longer asking the projection itself to separate the given types, this also sidesteps our issue with "hybrid" Pokémon; rather than forcing each Pokémon to be explained by a single pure type, we let the stat patterns speak for themselves. So we will go back and use the full dataset that includes both pure and hybrid Pokémon, and see whether a classifier trained on the principal components can recover the primary type.
We will again use the same technique of finding how many components we should break the labels into as we did with finding the number of components for LDA. After scaling the data, we can split the data into training and testing data, then take a look at the resulting confusion matrix and accuracy.
X = df.iloc[:, 5:11].values
y = df.iloc[:, 2].values
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
pca = PCA(n_components=None)
pca.fit(X)
pca_var_ratios = pca.explained_variance_ratio_
n = select_n_components(pca_var_ratios, 0.95)
pca = PCA(n_components=n)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)  # transform only: refitting PCA on the test set would leak information
X_pca = PCA(n_components=n).fit_transform(X)  # a separate fit on the full set, used for plotting later
classifier = RandomForestClassifier(max_depth=7)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)
print("Accuracy: " + str(accuracy_score(y_test, y_pred)))
[[0 0 0 0 0 1 0 0 0 3 0 0 1 0 1 0 0 5]
 [0 0 0 0 1 0 0 0 0 1 0 0 2 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0 0 0 1 1 0 2 0 0 0 0 7]
 [1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 2 0 0 1 0 0 0 0 1 0 0 2]
 [0 0 0 0 0 0 0 0 0 1 0 0 4 0 0 0 0 8]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [3 1 0 0 0 0 0 0 0 1 1 0 2 0 0 0 0 0]
 [1 0 1 1 0 0 0 0 0 1 0 0 7 0 0 0 0 8]
 [2 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 2]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3]
 [1 0 0 0 1 0 4 0 0 4 0 0 0 0 2 0 0 7]
 [1 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 1 0 0 0 0 2 0 3 0 0 0 0 4]
 [0 0 0 1 0 0 2 0 0 0 0 0 0 0 1 0 0 2]
 [0 0 0 2 0 0 0 0 0 0 0 0 3 0 1 0 0 0]
 [1 0 0 0 0 0 2 0 1 6 0 0 4 0 1 0 0 6]]
Accuracy: 0.05128205128205128
While this matrix might be hard to interpret, the main diagonal holds the correct classifications and everything off the diagonal is a misclassification. This yields a very low accuracy rate once again.
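As an optional readability aid (a small sketch, not part of the original pipeline), a labeled heatmap is much easier to read than the raw matrix; scikit-learn orders the rows and columns by the sorted union of the true and predicted labels:
# Label the confusion matrix with type names in scikit-learn's sorted label order
type_labels = np.unique(np.concatenate([y_test, y_pred]))
plt.figure(figsize=(10, 8))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', xticklabels=type_labels, yticklabels=type_labels)
plt.xlabel('Predicted type')
plt.ylabel('True type')
plt.show()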
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=df["type1"].map(color_dict)
)
plt.show()
Looking at how this model classifies the data across the first two principal components, the coloring appears essentially random.
With all the analysis thus far, I think it is safe to throw away the labels. That is, we have found strong evidence that the types do not dictate the proportions of the 6 battle stats in a Pokémon.
While this may sound disappointing, all we have found is that the types do not dictate the hidden archetypes we are looking for. It does not disprove the existence of those archetypes altogether. To continue our search, we will turn to unsupervised learning, where we discard the labels entirely and try to create our own.
What we will do is a very simple k-means analysis, where we try to create clusters of data points that are "close" to each other. You can imagine points on a graph where we try to draw k circles around the points so that each circle, or "cluster", is as dense with points as possible (the points are not sparsely spread around the circle).
X = df.iloc[:, 5:11].values  # k-means uses the full set here; no train/test split is needed
First, we need to figure out how many clusters to draw (i.e., the value of k). Since k-means is such a fast algorithm, a common way to do this is the "Elbow Method": we run the algorithm starting with just 2 clusters and go up to a specified amount (in this analysis I believe 20 is sufficient), then calculate the Calinski-Harabasz (CH) index for each run. This index is essentially the ratio of between-cluster dispersion to within-cluster dispersion; the higher the score, the better that k value performed.
So, for example, imagine we had a piece of paper with data points tightly drawn in each of the four corners. If we attempted to enclose these points with only 2 circles, on average a lot of points within a circle would be very far apart (there would be points on opposite ends of the paper), which corresponds to a low CH index. But with four circles, we could draw small circles in each of the four corners where all of the points are packed together closely, which corresponds to a high CH index.
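To make that intuition concrete, here is a tiny synthetic sketch (make_blobs and the corner positions are purely illustrative, not part of the Pokémon analysis): four tight blobs in the corners should score far higher with k=4 than with k=2.
# Toy version of the four-corners example using synthetic blobs
from sklearn.datasets import make_blobs
corners, _ = make_blobs(n_samples=200, centers=[(-5, -5), (-5, 5), (5, -5), (5, 5)], cluster_std=0.5, random_state=0)
for k in (2, 4):
    toy_labels = KMeans(n_clusters=k, random_state=0).fit_predict(corners)
    print(f"k = {k}: CH index = {calinski_harabasz_score(corners, toy_labels):.1f}")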
Going back to the Elbow Method, what we will do is just run the algorithm across these different values for k and then use the k value that resulted in the highest CH index.
scores = {}
scale = StandardScaler()
StdScale = scale.fit_transform(X)
max_clusters = 20
for i in range(2, max_clusters + 1):
    kmeans = KMeans(n_clusters=i)
    labels = kmeans.fit_predict(StdScale)  # clusters are fit on the scaled data
    ch_index = calinski_harabasz_score(X, labels)  # but scored on the raw proportions
    scores.update({i: ch_index})
n=max(scores, key=scores.get)
plt.plot(list(scores.keys()), list(scores.values()))
plt.xlabel("Number of clusters")
plt.ylabel("Calinski-Harabasz Index")
plt.annotate(f"Number of clusters = {n}", xy=(n, scores[n]), xytext=(n+1, scores[n]*.95), arrowprops=dict(arrowstyle="->"))
plt.xticks(range(2,22,2))
plt.show()
kmeans = KMeans(n_clusters = n, init="k-means++",random_state=1)
kmeans.fit(StdScale)
df["cluster"] = kmeans.labels_
We have found that the optimal number of clusters is 3. These three clusters are the best way to group together the different proportions of Pokémon battle stats, which means we have found exactly what we were looking for: these clusters represent the three main archetypes present in the Pokémon games! Looking at the same pair plots as before, we can see how the clusters separate the points.
sns.pairplot(df[['hp', 'attack', 'defense','sp_attack', 'sp_defense', 'speed', "cluster"]], hue = "cluster",palette="flare",)
While the clusters do not always perfectly separate the points into three groups, more often than not they do a good job. To get a better overall idea, let's look at that same graph of the first two principal components, except this time colored by cluster.
X = df.iloc[:, 5:11].values
scaler = StandardScaler()
X = scaler.fit_transform(X)
pca = PCA(n_components=2)  # we only plot the first two principal components
X_pca = pca.fit_transform(X)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=df["cluster"],
    cmap="flare"
)
plt.show()
This looks a lot better! As a reminder, we used the entire dataset of both hybrid and pure Pokémon here. We can investigate how each stat is distributed within each cluster by looking at some boxplots, this time split by cluster rather than lumped into a single boxplot per stat as before.
cluster_box_plots = pd.melt(df, id_vars = [
"number",
"name",
"type1",
"type2",
"generation",
"cluster"
], value_vars = [
'hp', 'attack', 'defense','sp_attack', 'sp_defense', 'speed'
])
plt.figure(figsize=(12,5))
ax = sns.boxplot(x="variable", y="value", hue = "cluster", data=cluster_box_plots,palette="flare")
plt.title("Battle Stats Boxplots by Cluster")
plt.xlabel("Skills")
plt.ylabel("Proportion")
Here we can see many differences: for example, Pokémon in the "1" cluster tend to have a lot more attack than the other two, while Pokémon in the "2" cluster have a lot more speed than the other two. We can sum up each cluster by simply looking at the average battle stat proportions in each cluster.
col_means = df.groupby('cluster')[cols].mean()  # select the 6 stat columns so non-numeric columns are excluded
col_means
| cluster | hp | attack | defense | sp_attack | sp_defense | speed |
|---|---|---|---|---|---|---|
| 0 | 0.160290 | 0.148680 | 0.174808 | 0.183738 | 0.196262 | 0.136221 |
| 1 | 0.171538 | 0.221638 | 0.183640 | 0.132283 | 0.147277 | 0.143912 |
| 2 | 0.147464 | 0.181961 | 0.132717 | 0.182942 | 0.142590 | 0.212327 |
As a point of reference, if a Pokémon had perfectly even stats across the board, we could call it a "balanced" archetype: each of the six proportions would be equal, namely 1/6 or 0.1666…. With this in mind, whenever we see a proportion near 16.67%, we can call that battle stat "average" or "baseline". To better visualize these clusters, we can create some pie charts, each representing an archetype!
pie_colors = sns.color_palette('pastel')[0:6]  # renamed to avoid shadowing the matplotlib "colors" import
for index, row in col_means.iterrows():
    plt.pie(row, labels=cols, colors=pie_colors, autopct='%.0f%%')
    plt.show()
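To quantify how each archetype departs from perfect balance, here is a small sketch subtracting the 1/6 baseline from each cluster's mean proportions (positive values are above baseline, in percentage points):
# Deviation of each cluster's mean proportions from the 1/6 "balanced" baseline
baseline = 1 / 6
print(((col_means - baseline) * 100).round(1))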
The first cluster ("0") sacrifices a bit of attack and speed to focus on somewhat high sp_attack and defense, with high sp_defense.
The second cluster ("1") is defined by high attack, with above-average defense and around-average hp. To compensate, it has low sp_attack and speed and relatively low sp_defense.
The third and final cluster ("2") has somewhat high attack and sp_attack along with high speed, but sacrifices the remaining three stats to compensate.
We first found that Pokémon battle stats are definitely not based on their types. This makes a lot of sense, because Pokémon has a very competitive side, and from a game-balancing perspective type-locked stats would not be healthy for the game. Type already plays a big part in Pokémon battles: types have weaknesses to other types (fire is weak to water, water is weak to grass, etc.). That means a player building a well-balanced, powerful team must keep types and archetypes in mind at the same time. For example, say you find that to round out your team you need a "tank" with high defenses and health to endure a lot of damage, but based on the rest of your team you don't want to be forced to choose, say, a rock type. The fact that Pokémon type has nothing to do with archetypes is good news in a situation like that!
Secondly, we managed to find what we were really looking for: the hidden archetypes in Pokémon, and how they are generally defined. One thing to note is that we used 3 clusters because that was the "best" we could do; it is possible that there is simply no good way to cluster this data, or that a different method for choosing the number of clusters would give a different answer. However, based on the scores and how the data visually clustered together, it seems we did as good a job as possible. Another consideration is that the battle stats used here include no special bonuses or other in-game stat modifiers, so this is not entirely reflective of actual gameplay.
With all that being said, we can say that generally when a Pokémon is designed, usually they tend to fall into one of our three archetypes. In the future as the newest set of Pokémon games are about to be released, it will be interesting to see if the new set of Pokémon will reinforce these clusters, or if they will end up redefining our clusters.