1 | { |
---|
2 | "cells": [ |
---|
3 | { |
---|
4 | "cell_type": "markdown", |
---|
5 | "metadata": {}, |
---|
6 | "source": [ |
---|
7 | "# Clustering Algorithms\n", |
---|
8 | "Stats 170A, Winter 2018\n", |
---|
9 | "\n", |
---|
10 | "Illustration of clustering algorithms" |
---|
11 | ] |
---|
12 | }, |
---|
13 | { |
---|
14 | "cell_type": "code", |
---|
15 | "execution_count": null, |
---|
16 | "metadata": { |
---|
17 | "collapsed": true |
---|
18 | }, |
---|
19 | "outputs": [], |
---|
20 | "source": [ |
---|
21 | "# First, we'll import pandas and numpy\n", |
---|
22 | "import pandas as pd\n", |
---|
23 | "import numpy as np\n", |
---|
24 | "\n", |
---|
25 | "# import relevant parts of sklearn\n", |
---|
26 | "from sklearn import cluster, datasets\n", |
---|
27 | "\n", |
---|
28 | "# We'll also import seaborn, a Python graphing library\n", |
---|
29 | "# (you may need to run >conda install seaborn (if using Anaconda)) \n", |
---|
30 | "import seaborn as sns\n", |
---|
31 | "\n", |
---|
32 | "# and matplotlib\n", |
---|
33 | "import matplotlib.pyplot as plt\n", |
---|
34 | "sns.set(style=\"white\", color_codes=True)\n", |
---|
35 | "%matplotlib inline\n", |
---|
36 | "\n", |
---|
37 | "# and turn off annoying warnings...(if we were writing \"real code\" we shouldn't do this)\n", |
---|
38 | "import warnings \n", |
---|
39 | "warnings.simplefilter('ignore')" |
---|
40 | ] |
---|
41 | }, |
---|
42 | { |
---|
43 | "cell_type": "markdown", |
---|
44 | "metadata": {}, |
---|
45 | "source": [ |
---|
46 | "## Load and Explore the Iris Data Set" |
---|
47 | ] |
---|
48 | }, |
---|
49 | { |
---|
50 | "cell_type": "code", |
---|
51 | "execution_count": null, |
---|
52 | "metadata": {}, |
---|
53 | "outputs": [], |
---|
54 | "source": [ |
---|
55 | "iris = pd.read_csv('iris.csv')\n", |
---|
56 | "iris.head() " |
---|
57 | ] |
---|
58 | }, |
---|
59 | { |
---|
60 | "cell_type": "markdown", |
---|
61 | "metadata": {}, |
---|
62 | "source": [ |
---|
63 | "## Use Kmeans to Cluster the Iris Data" |
---|
64 | ] |
---|
65 | }, |
---|
66 | { |
---|
67 | "cell_type": "code", |
---|
68 | "execution_count": null, |
---|
69 | "metadata": {}, |
---|
70 | "outputs": [], |
---|
71 | "source": [ |
---|
72 | "cols = iris.columns[0:4] # pull out the real-valued columns for clustering, ignore species column\n", |
---|
73 | "X = iris[cols] \n", |
---|
74 | "kmeans = cluster.KMeans(n_clusters=3)\n", |
---|
75 | "clabels = kmeans.fit_predict(X) \n", |
---|
76 | "iris['kmeans_label'] = clabels \n", |
---|
77 | "iris" |
---|
78 | ] |
---|
79 | }, |
---|
80 | { |
---|
81 | "cell_type": "markdown", |
---|
82 | "metadata": {}, |
---|
83 | "source": [ |
---|
84 | "## Visually compare K-Means Clustering with True Species Labels" |
---|
85 | ] |
---|
86 | }, |
---|
87 | { |
---|
88 | "cell_type": "code", |
---|
89 | "execution_count": null, |
---|
90 | "metadata": {}, |
---|
91 | "outputs": [], |
---|
92 | "source": [ |
---|
93 | "sns.set(rc={'axes.facecolor':'lightslategray'})\n", |
---|
94 | "sns.pairplot(x_vars=['sepal_length'], y_vars=['sepal_width'], data=iris, hue=\"kmeans_label\", size=4) \n", |
---|
95 | "sns.pairplot(x_vars=['sepal_length'], y_vars=['sepal_width'], data=iris, hue=\"species\", size=4)" |
---|
96 | ] |
---|
97 | }, |
---|
98 | { |
---|
99 | "cell_type": "markdown", |
---|
100 | "metadata": {}, |
---|
101 | "source": [ |
---|
102 | "## 3d Plot of Iris Data with KMeans Cluster Labels" |
---|
103 | ] |
---|
104 | }, |
---|
105 | { |
---|
106 | "cell_type": "code", |
---|
107 | "execution_count": null, |
---|
108 | "metadata": {}, |
---|
109 | "outputs": [], |
---|
110 | "source": [ |
---|
111 | "from mpl_toolkits.mplot3d import Axes3D\n", |
---|
112 | "%matplotlib \n", |
---|
113 | "fig = plt.figure()\n", |
---|
114 | "ax = fig.add_subplot(111, projection='3d')\n", |
---|
115 | "ax.scatter(iris['sepal_length'], iris['sepal_width'], iris['petal_length'], c=iris.kmeans_label, s=60)\n", |
---|
116 | "ax.view_init(30, 185)\n", |
---|
117 | "ax.set_xlabel('Sepal Length')\n", |
---|
118 | "ax.set_ylabel('Sepal Width')\n", |
---|
119 | "ax.set_zlabel('Petal Length')\n", |
---|
120 | "plt.show()" |
---|
121 | ] |
---|
122 | }, |
---|
123 | { |
---|
124 | "cell_type": "markdown", |
---|
125 | "metadata": {}, |
---|
126 | "source": [ |
---|
127 | "## Plot the Kmeans Centroids as 4-Dimensional Vectors" |
---|
128 | ] |
---|
129 | }, |
---|
130 | { |
---|
131 | "cell_type": "code", |
---|
132 | "execution_count": null, |
---|
133 | "metadata": {}, |
---|
134 | "outputs": [], |
---|
135 | "source": [ |
---|
136 | "%matplotlib inline\n", |
---|
137 | "centroids = kmeans.cluster_centers_ \n", |
---|
138 | "sns.set_style(\"whitegrid\")\n", |
---|
139 | "plt.plot(centroids.transpose())\n", |
---|
140 | "ax.set_xlabel('Indices of Variables')\n", |
---|
141 | "ax.set_ylabel('Values of Variables')\n", |
---|
142 | "plt.show()\n", |
---|
143 | "centroids" |
---|
144 | ] |
---|
145 | }, |
---|
146 | { |
---|
147 | "cell_type": "markdown", |
---|
148 | "metadata": {}, |
---|
149 | "source": [ |
---|
150 | "## Use Hierarchical (Agglomerative) Clustering to Cluster the Iris Data \n", |
---|
151 | "... and we will compare to the Kmeans clustering" |
---|
152 | ] |
---|
153 | }, |
---|
154 | { |
---|
155 | "cell_type": "code", |
---|
156 | "execution_count": null, |
---|
157 | "metadata": {}, |
---|
158 | "outputs": [], |
---|
159 | "source": [ |
---|
160 | "hlabels = cluster.AgglomerativeClustering(n_clusters=3, linkage='ward').fit_predict(X) \n", |
---|
161 | "iris['hier_label'] = hlabels \n", |
---|
162 | "print(hlabels[::10])\n", |
---|
163 | "print(clabels[::10])\n", |
---|
164 | "iris" |
---|
165 | ] |
---|
166 | }, |
---|
167 | { |
---|
168 | "cell_type": "markdown", |
---|
169 | "metadata": {}, |
---|
170 | "source": [ |
---|
171 | "## Analyzing Housing Data\n", |
---|
172 | "Now we look at a different data set, a housing data set used in a Kaggle prediction competition" |
---|
173 | ] |
---|
174 | }, |
---|
175 | { |
---|
176 | "cell_type": "code", |
---|
177 | "execution_count": null, |
---|
178 | "metadata": {}, |
---|
179 | "outputs": [], |
---|
180 | "source": [ |
---|
181 | "houses = pd.read_csv('house_price_prediction/train.csv')\n", |
---|
182 | "houses.head()" |
---|
183 | ] |
---|
184 | }, |
---|
185 | { |
---|
186 | "cell_type": "code", |
---|
187 | "execution_count": null, |
---|
188 | "metadata": {}, |
---|
189 | "outputs": [], |
---|
190 | "source": [ |
---|
191 | "analysis_vars = ['LotArea', 'LotFrontage', '1stFlrSF', 'GrLivArea', 'MoSold', 'YrSold', 'YearBuilt', 'SalePrice' ]\n", |
---|
192 | "houses_real = houses[analysis_vars].dropna()\n", |
---|
193 | "houses_real.head()" |
---|
194 | ] |
---|
195 | }, |
---|
196 | { |
---|
197 | "cell_type": "code", |
---|
198 | "execution_count": null, |
---|
199 | "metadata": {}, |
---|
200 | "outputs": [], |
---|
201 | "source": [ |
---|
202 | "kmeans2 = cluster.KMeans(n_clusters=3)\n", |
---|
203 | "clabels2 = kmeans2.fit_predict(houses_real.dropna()) \n", |
---|
204 | "houses_real['kmeans_label'] = clabels2 \n", |
---|
205 | "sns.set(rc={'axes.facecolor':'lightslategray'})\n", |
---|
206 | "sns.pairplot(x_vars=['YearBuilt'], y_vars=['SalePrice'], data=houses_real, hue=\"kmeans_label\", size=6) " |
---|
207 | ] |
---|
208 | }, |
---|
209 | { |
---|
210 | "cell_type": "markdown", |
---|
211 | "metadata": {}, |
---|
212 | "source": [ |
---|
213 | "### Comment\n", |
---|
214 | "The plot above actually gives us some useful information about what Kmeans is doing with this data. It is telling us that kmeans is splitting along the y-axis (Sale Price) and virtually ignoring the x-axis (YearBuilt). Why is this happening? And what might we do to the data to change this behavior?" |
---|
215 | ] |
---|
216 | }, |
---|
217 | { |
---|
218 | "cell_type": "code", |
---|
219 | "execution_count": null, |
---|
220 | "metadata": {}, |
---|
221 | "outputs": [], |
---|
222 | "source": [ |
---|
223 | "centroids = kmeans2.cluster_centers_ \n", |
---|
224 | "np.set_printoptions(suppress=True) # suppress scientific notation so that we can see the \"real\" values\n", |
---|
225 | "centroids" |
---|
226 | ] |
---|
227 | }, |
---|
228 | { |
---|
229 | "cell_type": "markdown", |
---|
230 | "metadata": {}, |
---|
231 | "source": [ |
---|
232 | "### Let's normalize the columns to have mean 0 and standard deviation 1" |
---|
233 | ] |
---|
234 | }, |
---|
235 | { |
---|
236 | "cell_type": "code", |
---|
237 | "execution_count": null, |
---|
238 | "metadata": {}, |
---|
239 | "outputs": [], |
---|
240 | "source": [ |
---|
241 | "tmp = houses_real.drop(['kmeans_label'],axis=1)\n", |
---|
242 | "tmp.head() \n", |
---|
243 | "np.set_printoptions(suppress=True) # suppress scientific notation so that we can see the \"real\" values\n", |
---|
244 | "#Norm_houses_real = tmp.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))\n", |
---|
245 | "Norm_houses_real = tmp.apply(lambda x: (x - np.mean(x)) / (np.std(x)) )\n", |
---|
246 | "Norm_houses_real.head()" |
---|
247 | ] |
---|
248 | }, |
---|
249 | { |
---|
250 | "cell_type": "code", |
---|
251 | "execution_count": null, |
---|
252 | "metadata": {}, |
---|
253 | "outputs": [], |
---|
254 | "source": [ |
---|
255 | "Norm_houses_real.describe()" |
---|
256 | ] |
---|
257 | }, |
---|
258 | { |
---|
259 | "cell_type": "code", |
---|
260 | "execution_count": null, |
---|
261 | "metadata": {}, |
---|
262 | "outputs": [], |
---|
263 | "source": [ |
---|
264 | "clabels2 = kmeans2.fit_predict(Norm_houses_real.dropna()) \n", |
---|
265 | "Norm_houses_real['kmeans_label'] = clabels2 \n", |
---|
266 | "houses_real['kmeans_label'] = clabels2 \n", |
---|
267 | "sns.set(rc={'axes.facecolor':'lightslategray'})\n", |
---|
268 | "sns.pairplot(x_vars=['YearBuilt'], y_vars=['SalePrice'], data=Norm_houses_real, hue=\"kmeans_label\", size=6) " |
---|
269 | ] |
---|
270 | }, |
---|
271 | { |
---|
272 | "cell_type": "code", |
---|
273 | "execution_count": null, |
---|
274 | "metadata": {}, |
---|
275 | "outputs": [], |
---|
276 | "source": [ |
---|
277 | "centroids = kmeans2.cluster_centers_ \n", |
---|
278 | "np.set_printoptions(suppress=True) # suppress scientific notation so that we can see the \"real\" values\n", |
---|
279 | "centroids" |
---|
280 | ] |
---|
281 | }, |
---|
282 | { |
---|
283 | "cell_type": "code", |
---|
284 | "execution_count": null, |
---|
285 | "metadata": {}, |
---|
286 | "outputs": [], |
---|
287 | "source": [ |
---|
288 | "sns.set_style(\"whitegrid\")\n", |
---|
289 | "plt.plot(centroids.transpose())\n", |
---|
290 | "ax.set_xlabel('Indices of Variables')\n", |
---|
291 | "ax.set_ylabel('Values of Variables')\n", |
---|
292 | "plt.show() " |
---|
293 | ] |
---|
294 | }, |
---|
295 | { |
---|
296 | "cell_type": "code", |
---|
297 | "execution_count": null, |
---|
298 | "metadata": {}, |
---|
299 | "outputs": [], |
---|
300 | "source": [ |
---|
301 | "sns.set(rc={'axes.facecolor':'lightslategray'})\n", |
---|
302 | "sns.pairplot(x_vars=['GrLivArea'], y_vars=['SalePrice'], data=Norm_houses_real, hue=\"kmeans_label\", size=6) " |
---|
303 | ] |
---|
304 | }, |
---|
305 | { |
---|
306 | "cell_type": "code", |
---|
307 | "execution_count": null, |
---|
308 | "metadata": {}, |
---|
309 | "outputs": [], |
---|
310 | "source": [ |
---|
311 | "sns.set(rc={'axes.facecolor':'lightslategray'})\n", |
---|
312 | "sns.pairplot(x_vars=['YearBuilt'], y_vars=['GrLivArea'], data=houses_real, hue=\"kmeans_label\", size=6) " |
---|
313 | ] |
---|
314 | }, |
---|
315 | { |
---|
316 | "cell_type": "code", |
---|
317 | "execution_count": null, |
---|
318 | "metadata": { |
---|
319 | "collapsed": true |
---|
320 | }, |
---|
321 | "outputs": [], |
---|
322 | "source": [] |
---|
323 | } |
---|
324 | ], |
---|
325 | "metadata": { |
---|
326 | "anaconda-cloud": {}, |
---|
327 | "kernelspec": { |
---|
328 | "display_name": "Python 3", |
---|
329 | "language": "python", |
---|
330 | "name": "python3" |
---|
331 | }, |
---|
332 | "language_info": { |
---|
333 | "codemirror_mode": { |
---|
334 | "name": "ipython", |
---|
335 | "version": 3 |
---|
336 | }, |
---|
337 | "file_extension": ".py", |
---|
338 | "mimetype": "text/x-python", |
---|
339 | "name": "python", |
---|
340 | "nbconvert_exporter": "python", |
---|
341 | "pygments_lexer": "ipython3", |
---|
342 | "version": "3.6.3" |
---|
343 | } |
---|
344 | }, |
---|
345 | "nbformat": 4, |
---|
346 | "nbformat_minor": 2 |
---|
347 | } |
---|