stats170ab-2018: clustering_demo.ipynb

File clustering_demo.ipynb, 8.9 KB (added by smyth, 3 years ago)
Line 
1{
2 "cells": [
3  {
4   "cell_type": "markdown",
5   "metadata": {},
6   "source": [
7    "# Clustering Algorithms\n",
8    "Stats 170A, Winter 2018\n",
9    "\n",
10    "Illustration of clustering algorithms"
11   ]
12  },
13  {
14   "cell_type": "code",
15   "execution_count": null,
16   "metadata": {
17    "collapsed": true
18   },
19   "outputs": [],
20   "source": [
21    "# First, we'll import pandas and numpy\n",
22    "import pandas as pd\n",
23    "import numpy as np\n",
24    "\n",
25    "# import relevant parts of sklearn\n",
26    "from sklearn import cluster, datasets\n",
27    "\n",
28    "# We'll also import seaborn, a Python graphing library\n",
29    "# (you may need to run >conda install seaborn (if using Anaconda)) \n",
30    "import seaborn as sns\n",
31    "\n",
32    "# and matplotlib\n",
33    "import matplotlib.pyplot as plt\n",
34    "sns.set(style=\"white\", color_codes=True)\n",
35    "%matplotlib inline\n",
36    "\n",
37    "# and turn off annoying warnings...(if we were writing \"real code\" we shouldn't do this)\n",
38    "import warnings \n",
39    "warnings.simplefilter('ignore')"
40   ]
41  },
42  {
43   "cell_type": "markdown",
44   "metadata": {},
45   "source": [
46    "## Load and Explore the Iris Data Set"
47   ]
48  },
49  {
50   "cell_type": "code",
51   "execution_count": null,
52   "metadata": {},
53   "outputs": [],
54   "source": [
55    "iris = pd.read_csv('iris.csv')\n",
56    "iris.head() "
57   ]
58  },
59  {
60   "cell_type": "markdown",
61   "metadata": {},
62   "source": [
63    "## Use Kmeans to Cluster the Iris Data"
64   ]
65  },
66  {
67   "cell_type": "code",
68   "execution_count": null,
69   "metadata": {},
70   "outputs": [],
71   "source": [
72    "cols = iris.columns[0:4]   # pull out the real-valued columns for clustering, ignore species column\n",
73    "X = iris[cols] \n",
74    "kmeans = cluster.KMeans(n_clusters=3)\n",
75    "clabels = kmeans.fit_predict(X) \n",
76    "iris['kmeans_label'] = clabels \n",
77    "iris"
78   ]
79  },
80  {
81   "cell_type": "markdown",
82   "metadata": {},
83   "source": [
84    "## Visually compare K-Means Clustering with True Species Labels"
85   ]
86  },
87  {
88   "cell_type": "code",
89   "execution_count": null,
90   "metadata": {},
91   "outputs": [],
92   "source": [
93    "sns.set(rc={'axes.facecolor':'lightslategray'})\n",
94    "sns.pairplot(x_vars=['sepal_length'], y_vars=['sepal_width'], data=iris, hue=\"kmeans_label\", size=4) \n",
95    "sns.pairplot(x_vars=['sepal_length'], y_vars=['sepal_width'], data=iris, hue=\"species\", size=4)"
96   ]
97  },
98  {
99   "cell_type": "markdown",
100   "metadata": {},
101   "source": [
102    "## 3d Plot of Iris Data with KMeans Cluster Labels"
103   ]
104  },
105  {
106   "cell_type": "code",
107   "execution_count": null,
108   "metadata": {},
109   "outputs": [],
110   "source": [
111    "from mpl_toolkits.mplot3d import Axes3D\n",
112    "%matplotlib \n",
113    "fig = plt.figure()\n",
114    "ax = fig.add_subplot(111, projection='3d')\n",
115    "ax.scatter(iris['sepal_length'], iris['sepal_width'], iris['petal_length'], c=iris.kmeans_label, s=60)\n",
116    "ax.view_init(30, 185)\n",
117    "ax.set_xlabel('Sepal Length')\n",
118    "ax.set_ylabel('Sepal Width')\n",
119    "ax.set_zlabel('Petal Length')\n",
120    "plt.show()"
121   ]
122  },
123  {
124   "cell_type": "markdown",
125   "metadata": {},
126   "source": [
127    "## Plot the Kmeans Centroids as 4-Dimensional Vectors"
128   ]
129  },
130  {
131   "cell_type": "code",
132   "execution_count": null,
133   "metadata": {},
134   "outputs": [],
135   "source": [
136    "%matplotlib inline\n",
137    "centroids = kmeans.cluster_centers_ \n",
138    "sns.set_style(\"whitegrid\")\n",
139    "plt.plot(centroids.transpose())\n",
140    "ax.set_xlabel('Indices of Variables')\n",
141    "ax.set_ylabel('Values of Variables')\n",
142    "plt.show()\n",
143    "centroids"
144   ]
145  },
146  {
147   "cell_type": "markdown",
148   "metadata": {},
149   "source": [
150    "## Use Hierarchical (Agglomerative) Clustering to Cluster the Iris Data \n",
151    "... and we will compare to the Kmeans clustering"
152   ]
153  },
154  {
155   "cell_type": "code",
156   "execution_count": null,
157   "metadata": {},
158   "outputs": [],
159   "source": [
160    "hlabels = cluster.AgglomerativeClustering(n_clusters=3, linkage='ward').fit_predict(X) \n",
161    "iris['hier_label'] = hlabels \n",
162    "print(hlabels[::10])\n",
163    "print(clabels[::10])\n",
164    "iris"
165   ]
166  },
167  {
168   "cell_type": "markdown",
169   "metadata": {},
170   "source": [
171    "## Analyzing Housing Data\n",
172    "Now we look at a different data set, a housing data set used in a Kaggle prediction competition"
173   ]
174  },
175  {
176   "cell_type": "code",
177   "execution_count": null,
178   "metadata": {},
179   "outputs": [],
180   "source": [
181    "houses = pd.read_csv('house_price_prediction/train.csv')\n",
182    "houses.head()"
183   ]
184  },
185  {
186   "cell_type": "code",
187   "execution_count": null,
188   "metadata": {},
189   "outputs": [],
190   "source": [
191    "analysis_vars = ['LotArea', 'LotFrontage', '1stFlrSF', 'GrLivArea', 'MoSold', 'YrSold',  'YearBuilt',   'SalePrice'  ]\n",
192    "houses_real = houses[analysis_vars].dropna()\n",
193    "houses_real.head()"
194   ]
195  },
196  {
197   "cell_type": "code",
198   "execution_count": null,
199   "metadata": {},
200   "outputs": [],
201   "source": [
202    "kmeans2 = cluster.KMeans(n_clusters=3)\n",
203    "clabels2 = kmeans2.fit_predict(houses_real.dropna()) \n",
204    "houses_real['kmeans_label'] = clabels2 \n",
205    "sns.set(rc={'axes.facecolor':'lightslategray'})\n",
206    "sns.pairplot(x_vars=['YearBuilt'], y_vars=['SalePrice'], data=houses_real, hue=\"kmeans_label\", size=6) "
207   ]
208  },
209  {
210   "cell_type": "markdown",
211   "metadata": {},
212   "source": [
213    "### Comment\n",
214    "The plot above actually gives us some useful information about what Kmeans is doing with this data. It is telling us that Kmeans is splitting along the y-axis (Sale Price) and virtually ignoring the x-axis (YearBuilt). Why is this happening? And what might we do to the data to change this behavior?"
215   ]
216  },
217  {
218   "cell_type": "code",
219   "execution_count": null,
220   "metadata": {},
221   "outputs": [],
222   "source": [
223    "centroids = kmeans2.cluster_centers_ \n",
224    "np.set_printoptions(suppress=True)  # suppress scientific notation so that we can see the \"real\" values\n",
225    "centroids"
226   ]
227  },
228  {
229   "cell_type": "markdown",
230   "metadata": {},
231   "source": [
232    "### Let's normalize the columns to have mean 0 and standard deviation 1"
233   ]
234  },
235  {
236   "cell_type": "code",
237   "execution_count": null,
238   "metadata": {},
239   "outputs": [],
240   "source": [
241    "tmp = houses_real.drop(['kmeans_label'],axis=1)\n",
242    "tmp.head() \n",
243    "np.set_printoptions(suppress=True)  # suppress scientific notation so that we can see the \"real\" values\n",
244    "#Norm_houses_real = tmp.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))\n",
245    "Norm_houses_real = tmp.apply(lambda x: (x - np.mean(x)) / (np.std(x)) )\n",
246    "Norm_houses_real.head()"
247   ]
248  },
249  {
250   "cell_type": "code",
251   "execution_count": null,
252   "metadata": {},
253   "outputs": [],
254   "source": [
255    "Norm_houses_real.describe()"
256   ]
257  },
258  {
259   "cell_type": "code",
260   "execution_count": null,
261   "metadata": {},
262   "outputs": [],
263   "source": [
264    "clabels2 = kmeans2.fit_predict(Norm_houses_real.dropna()) \n",
265    "Norm_houses_real['kmeans_label'] = clabels2 \n",
266    "houses_real['kmeans_label'] = clabels2 \n",
267    "sns.set(rc={'axes.facecolor':'lightslategray'})\n",
268    "sns.pairplot(x_vars=['YearBuilt'], y_vars=['SalePrice'], data=Norm_houses_real, hue=\"kmeans_label\", size=6) "
269   ]
270  },
271  {
272   "cell_type": "code",
273   "execution_count": null,
274   "metadata": {},
275   "outputs": [],
276   "source": [
277    "centroids = kmeans2.cluster_centers_ \n",
278    "np.set_printoptions(suppress=True)  # suppress scientific notation so that we can see the \"real\" values\n",
279    "centroids"
280   ]
281  },
282  {
283   "cell_type": "code",
284   "execution_count": null,
285   "metadata": {},
286   "outputs": [],
287   "source": [
288    "sns.set_style(\"whitegrid\")\n",
289    "plt.plot(centroids.transpose())\n",
290    "ax.set_xlabel('Indices of Variables')\n",
291    "ax.set_ylabel('Values of Variables')\n",
292    "plt.show() "
293   ]
294  },
295  {
296   "cell_type": "code",
297   "execution_count": null,
298   "metadata": {},
299   "outputs": [],
300   "source": [
301    "sns.set(rc={'axes.facecolor':'lightslategray'})\n",
302    "sns.pairplot(x_vars=['GrLivArea'], y_vars=['SalePrice'], data=Norm_houses_real, hue=\"kmeans_label\", size=6) "
303   ]
304  },
305  {
306   "cell_type": "code",
307   "execution_count": null,
308   "metadata": {},
309   "outputs": [],
310   "source": [
311    "sns.set(rc={'axes.facecolor':'lightslategray'})\n",
312    "sns.pairplot(x_vars=['YearBuilt'], y_vars=['GrLivArea'], data=houses_real, hue=\"kmeans_label\", size=6) "
313   ]
314  },
315  {
316   "cell_type": "code",
317   "execution_count": null,
318   "metadata": {
319    "collapsed": true
320   },
321   "outputs": [],
322   "source": []
323  }
324 ],
325 "metadata": {
326  "anaconda-cloud": {},
327  "kernelspec": {
328   "display_name": "Python 3",
329   "language": "python",
330   "name": "python3"
331  },
332  "language_info": {
333   "codemirror_mode": {
334    "name": "ipython",
335    "version": 3
336   },
337   "file_extension": ".py",
338   "mimetype": "text/x-python",
339   "name": "python",
340   "nbconvert_exporter": "python",
341   "pygments_lexer": "ipython3",
342   "version": "3.6.3"
343  }
344 },
345 "nbformat": 4,
346 "nbformat_minor": 2
347}