From 0e381bf47babae00536317a7d58ef679b272e862 Mon Sep 17 00:00:00 2001 From: Manuel Thalmann Date: Sat, 10 Jun 2023 14:18:49 +0200 Subject: [PATCH] Solve MLDM Lab01 --- .../L01_Data_Processing_LAB_ASSIGNMENT.ipynb | 562 ++++++++++++++++-- Pipfile | 1 + Pipfile.lock | 10 +- 3 files changed, 527 insertions(+), 46 deletions(-) diff --git a/Notes/Semester 4/MLDM - Machine Learning and Data Mining/Labs/L01_Data_Processing_LAB_ASSIGNMENT.ipynb b/Notes/Semester 4/MLDM - Machine Learning and Data Mining/Labs/L01_Data_Processing_LAB_ASSIGNMENT.ipynb index ba641f7..fd09056 100644 --- a/Notes/Semester 4/MLDM - Machine Learning and Data Mining/Labs/L01_Data_Processing_LAB_ASSIGNMENT.ipynb +++ b/Notes/Semester 4/MLDM - Machine Learning and Data Mining/Labs/L01_Data_Processing_LAB_ASSIGNMENT.ipynb @@ -6,19 +6,7 @@ "metadata": { "id": "8bPV9aEwTKC8" }, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'sklearn'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnumpy\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mnp\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mmatplotlib\u001b[39;00m \u001b[39mimport\u001b[39;00m pyplot \u001b[39mas\u001b[39;00m plt\n\u001b[0;32m----> 3\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39msklearn\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'sklearn'" - ] - } - ], + "outputs": [], "source": [ "import numpy as np\n", "from matplotlib import pyplot as plt\n", @@ -28,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 2, "metadata": { "id": "jFHJbjkfeepf" }, @@ -54,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "em6VCOuE6MRU" }, @@ -66,22 +54,310 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "id": "HJoAuMNR6MgM" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesproline
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.0
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.0
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.0
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.0
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.0
..........................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.0
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.0
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.0
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.0
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.0
\n", + "

178 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline \n", + "0 3.92 1065.0 \n", + "1 3.40 1050.0 \n", + "2 3.17 1185.0 \n", + "3 3.45 1480.0 \n", + "4 2.93 735.0 \n", + ".. ... ... 
\n", + "173 1.74 740.0 \n", + "174 1.56 750.0 \n", + "175 1.56 835.0 \n", + "176 1.62 840.0 \n", + "177 1.60 560.0 \n", + "\n", + "[178 rows x 13 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "xrsPKm3w6Mi-" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 0\n", + " ..\n", + "173 2\n", + "174 2\n", + "175 2\n", + "176 2\n", + "177 2\n", + "Name: target, Length: 178, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "target" ] @@ -99,7 +375,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "m1w8dDgw6MoO" }, @@ -123,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "id": "pvm_zBOe-e_X" }, @@ -165,12 +441,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "id": "5ToW8fx4ANZ8" }, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The accuracy of the Decision Tree classifier is 0.9661016949152542\n", + "The accuracy of the Support Vector classifier is 0.711864406779661\n" + ] + } + ], + "source": [ + "run_decision_tree(X_train, y_train, X_test, y_test)\n", + "run_svc(X_train, y_train, X_test, y_test)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "_The Decision Tree Classifier seems to be working better._" + ] }, { "attachments": {}, @@ -192,12 +488,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "id": "K0qkP9TqBRft" }, "outputs": [], - "source": [] + "source": [ + "def std_norm(data):\n", + " return (data - X_train.mean()) / X_train.std()\n", + "\n", + 
"X_train_std_norm = std_norm(X_train)\n", + "X_test_std_norm = std_norm(X_test)" + ] }, { "attachments": {}, @@ -211,12 +513,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "id": "TFg6WbmgBShk" }, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The accuracy of the Support Vector classifier is 0.9830508474576272\n", + "The accuracy of the Decision Tree classifier is 0.9661016949152542\n" + ] + } + ], + "source": [ + "run_svc(X_train_std_norm, y_train, X_test_std_norm, y_test)\n", + "run_decision_tree(X_train_std_norm, y_train, X_test_std_norm, y_test)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "_Now the Support Vector Classifier is more accurate - however, both have a very high accuracy._" + ] }, { "attachments": {}, @@ -236,12 +558,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "id": "i25XenppJ7gf" }, "outputs": [], - "source": [] + "source": [ + "def min_max_norm(data):\n", + " return (data - X_train.min()) / (X_train.max() - X_train.min())\n", + "\n", + "X_train_min_max_norm = min_max_norm(X_train)\n", + "X_test_min_max_norm = min_max_norm(X_test)" + ] }, { "attachments": {}, @@ -255,12 +583,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "id": "99uuR7ngJ7gr" }, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The accuracy of the Support Vector classifier is 0.9830508474576272\n", + "The accuracy of the Decision Tree classifier is 0.9661016949152542\n" + ] + } + ], + "source": [ + "run_svc(X_train_min_max_norm, y_train, X_test_min_max_norm, y_test)\n", + "run_decision_tree(X_train_min_max_norm, y_train, X_test_min_max_norm, y_test)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "_The 
accuracy does not change._" + ] }, { "attachments": {}, @@ -289,7 +637,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "id": "w9qp3e4nBTPK" }, @@ -319,12 +667,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "id": "-lc07hbiOvYu" }, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for data in [\n", + " [X_train, \"Un-Normalized\"],\n", + " [X_train_std_norm, \"Standard Deviation\"],\n", + " [X_train_min_max_norm, \"Min/Max Normalized\"]]:\n", + " sns.scatterplot(data[0], x=\"alcohol\", y=\"malic_acid\", label=data[1])" + ] }, { "attachments": {}, @@ -346,12 +711,112 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "id": "J3D06pyKQjGq" }, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AverageStandard DeviationMinimumMaximumRange
Un-Normalized1.297101e+010.85197511.03000014.8300003.800000
Standard Deviation-1.160603e-151.000000-2.2782452.1819794.460224
Min/Max Normalized5.107917e-010.2242040.0000001.0000001.000000
\n", + "
" + ], + "text/plain": [ + " Average Standard Deviation Minimum Maximum \\\n", + "Un-Normalized 1.297101e+01 0.851975 11.030000 14.830000 \n", + "Standard Deviation -1.160603e-15 1.000000 -2.278245 2.181979 \n", + "Min/Max Normalized 5.107917e-01 0.224204 0.000000 1.000000 \n", + "\n", + " Range \n", + "Un-Normalized 3.800000 \n", + "Standard Deviation 4.460224 \n", + "Min/Max Normalized 1.000000 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.DataFrame(\n", + " [\n", + " [\n", + " selector(dataset[\"alcohol\"])\n", + " for selector in [\n", + " lambda column : column.mean(),\n", + " lambda column : column.std(),\n", + " lambda column : column.min(),\n", + " lambda column : column.max(),\n", + " lambda column : column.max() - column.min()]\n", + " ]\n", + " for dataset in [X_train, X_train_std_norm, X_train_min_max_norm]\n", + " ],\n", + " columns=[\"Average\", \"Standard Deviation\", \"Minimum\", \"Maximum\", \"Range\"],\n", + " index=[\"Un-Normalized\", \"Standard Deviation\", \"Min/Max Normalized\"])\n", + "\n", + "data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As expected, the _Standard Deviation Normalization_ causes the standard deviation to equal $1$. The _Min/Max Normalization_ causes all values to be in the range of $0..1$ making the range equal $1$." + ] }, { "attachments": {}, @@ -415,7 +880,11 @@ "id": "eukBUnVs4aZE" }, "source": [ - "❗ TODO ❗" + "❗ TODO ❗\n", + " 1. $\\frac{70 - 13}{3} = 19$ $\\rightarrow$ `[[13-32], [32-51], [51-70]]`\n", + " 2. `[[13, 15, 16, 18, 19, 20, 20, 21, 22, 22, 25, 25, 26, 26, 30], [33, 34, 35, 35, 35, 36, 37, 40, 42, 46], [53, 70]]`\n", + " 3. `[21.2, 37.3, 61.5]`\n", + " 4. 
`[21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 21.2, 37.3, 37.3, 37.3, 37.3, 37.3, 37.3, 37.3, 37.3, 37.3, 37.3, 61.5, 61.5]`" ] }, { @@ -445,8 +914,11 @@ "id": "Vhf3wkSm4aZF" }, "source": [ - "\n", - "❗ TODO ❗" + "❗ TODO ❗\n", + " 1. $9$\n", + " 2. `[[13, 15, 16, 18, 19, 20, 20, 21, 22], [22, 25, 25, 26, 26, 30, 33, 34, 35], [35, 35, 36, 37, 40, 42, 46, 53, 70]]`\n", + " 3. `[18.222, 28.444, 43.778]`\n", + " 4. `[18.222, 18.222, 18.222, 18.222, 18.222, 18.222, 18.222, 18.222, 18.222, 28.444, 28.444, 28.444, 28.444, 28.444, 28.444, 28.444, 28.444, 28.444, 43.778, 43.778, 43.778, 43.778, 43.778, 43.778, 43.778, 43.778, 43.778]`" ] }, { @@ -484,7 +956,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.11.3" }, "vscode": { "interpreter": { diff --git a/Pipfile b/Pipfile index 4a40bfe..ae45df1 100644 --- a/Pipfile +++ b/Pipfile @@ -14,5 +14,6 @@ ipympl = "*" sympy = "*" autopep8 = "*" scikit-learn = "*" +seaborn = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index ed5bfe3..2880a61 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "b14e3d25c62a40762f737e4cc3c6985e5d34c5ab6f9e40553295a2593f53996b" + "sha256": "84b9c7bb8fd900578a1275894f11a3084686fba998fbd93a9409ab5121dd0bcb" }, "pipfile-spec": 6, "requires": {}, @@ -777,6 +777,14 @@ "index": "pypi", "version": "==1.10.1" }, + "seaborn": { + "hashes": [ + "sha256:374645f36509d0dcab895cba5b47daf0586f77bfe3b36c97c607db7da5be0139", + "sha256:ebf15355a4dba46037dfd65b7350f014ceb1f13c05e814eda2c9f5fd731afc08" + ], + "index": "pypi", + "version": "==0.12.2" + }, "six": { "hashes": [ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",