diff --git a/notebooks/01-exploratory_data_analysis.ipynb b/notebooks/01-exploratory_data_analysis.ipynb index 8a8a8d9..98df65a 100644 --- a/notebooks/01-exploratory_data_analysis.ipynb +++ b/notebooks/01-exploratory_data_analysis.ipynb @@ -13,22 +13,600 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd" + "from IPython.display import display, Markdown\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 1. Descrição dos dados" + "## Leitura do conjunto de dados\n", + "***" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeGenderHeightWeightCALCFAVCFCVCNCPSCCSMOKECH2Ofamily_history_with_overweightFAFTUECAECMTRANSNObeyesdad
021.000000Female1.62000064.000000nono2.03.0nono2.000000yes0.0000001.000000SometimesPublic_TransportationNormal_Weight
121.000000Female1.52000056.000000Sometimesno3.03.0yesyes3.000000yes3.0000000.000000SometimesPublic_TransportationNormal_Weight
223.000000Male1.80000077.000000Frequentlyno2.03.0nono2.000000yes2.0000001.000000SometimesPublic_TransportationNormal_Weight
327.000000Male1.80000087.000000Frequentlyno3.03.0nono2.000000no2.0000000.000000SometimesWalkingOverweight_Level_I
422.000000Male1.78000089.800000Sometimesno2.01.0nono2.000000no0.0000000.000000SometimesPublic_TransportationOverweight_Level_II
......................................................
210620.976842Female1.710730131.408528Sometimesyes3.03.0nono1.728139yes1.6762690.906247SometimesPublic_TransportationObesity_Type_III
210721.982942Female1.748584133.742943Sometimesyes3.03.0nono2.005130yes1.3413900.599270SometimesPublic_TransportationObesity_Type_III
210822.524036Female1.752206133.689352Sometimesyes3.03.0nono2.054193yes1.4142090.646288SometimesPublic_TransportationObesity_Type_III
210924.361936Female1.739450133.346641Sometimesyes3.03.0nono2.852339yes1.1391070.586035SometimesPublic_TransportationObesity_Type_III
211023.664709Female1.738836133.472641Sometimesyes3.03.0nono2.863513yes1.0264520.714137SometimesPublic_TransportationObesity_Type_III
\n", + "

2111 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " Age Gender Height Weight CALC FAVC FCVC NCP \\\n", + "0 21.000000 Female 1.620000 64.000000 no no 2.0 3.0 \n", + "1 21.000000 Female 1.520000 56.000000 Sometimes no 3.0 3.0 \n", + "2 23.000000 Male 1.800000 77.000000 Frequently no 2.0 3.0 \n", + "3 27.000000 Male 1.800000 87.000000 Frequently no 3.0 3.0 \n", + "4 22.000000 Male 1.780000 89.800000 Sometimes no 2.0 1.0 \n", + "... ... ... ... ... ... ... ... ... \n", + "2106 20.976842 Female 1.710730 131.408528 Sometimes yes 3.0 3.0 \n", + "2107 21.982942 Female 1.748584 133.742943 Sometimes yes 3.0 3.0 \n", + "2108 22.524036 Female 1.752206 133.689352 Sometimes yes 3.0 3.0 \n", + "2109 24.361936 Female 1.739450 133.346641 Sometimes yes 3.0 3.0 \n", + "2110 23.664709 Female 1.738836 133.472641 Sometimes yes 3.0 3.0 \n", + "\n", + " SCC SMOKE CH2O family_history_with_overweight FAF TUE \\\n", + "0 no no 2.000000 yes 0.000000 1.000000 \n", + "1 yes yes 3.000000 yes 3.000000 0.000000 \n", + "2 no no 2.000000 yes 2.000000 1.000000 \n", + "3 no no 2.000000 no 2.000000 0.000000 \n", + "4 no no 2.000000 no 0.000000 0.000000 \n", + "... ... ... ... ... ... ... \n", + "2106 no no 1.728139 yes 1.676269 0.906247 \n", + "2107 no no 2.005130 yes 1.341390 0.599270 \n", + "2108 no no 2.054193 yes 1.414209 0.646288 \n", + "2109 no no 2.852339 yes 1.139107 0.586035 \n", + "2110 no no 2.863513 yes 1.026452 0.714137 \n", + "\n", + " CAEC MTRANS NObeyesdad \n", + "0 Sometimes Public_Transportation Normal_Weight \n", + "1 Sometimes Public_Transportation Normal_Weight \n", + "2 Sometimes Public_Transportation Normal_Weight \n", + "3 Sometimes Walking Overweight_Level_I \n", + "4 Sometimes Public_Transportation Overweight_Level_II \n", + "... ... ... ... \n", + "2106 Sometimes Public_Transportation Obesity_Type_III \n", + "2107 Sometimes Public_Transportation Obesity_Type_III \n", + "2108 Sometimes Public_Transportation Obesity_Type_III \n", + "2109 Sometimes Public_Transportation Obesity_Type_III \n", + "2110 Sometimes Public_Transportation Obesity_Type_III \n", + "\n", + "[2111 rows x 17 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df = pd.read_csv('../data/raw/obesity_dataset.csv')\n", + "display(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variaveldescricaotiposubtipo
0Ageidade do indivíduoquantitativacontínua
1Gendergênero do indivíduoqualitativanominal
2Heightaltura do indiíduoquantitativacontínua
3Weightpeso do indivíduoquantitativacontínua
4CALCfrequência do consumo de álcool pelo indivíduoqualitativaordinal
5FAVCindica se o indivíduo consome comidas altamentes calóricas com frequênciaqualitativanominal
6FCVCindica o nível de consumo de vegetais nas refeições do indivíduoquantitativadiscreta
7NCPquantas refeições principais o indivíduo faz diariamentequantitativacontínua
8SCCindica se o indivíduo monitora as calorias ingeridas diariamentequalitativanominal
9SMOKEindica se o indivíduo fuma ou nãoqualitativanominal
10CH2Oquanta água o indivíduo consome diariamentequantitativacontínua
11family_history_with_overweightindica se algum membro da família do indivídio sofreu ou sofre com excesso de pesoqualitativanominal
12FAFquão frequentemente o indivíduo pratica atividades físicasquantitativacontínua
13TUEquanto tempo o indivíduo passa usando dispositivos tecnológicosquantitativacontínua
14CAECfrequência em que o indivíduo come algum alimento entre as refeiçõesqualitativaordinal
15MTRANStipo de transporte que o indivíduo costuma usarqualitativanominal
16NObeyesdadnível de obesidade do indivíduoqualitativaordinal
\n", + "
" + ], + "text/plain": [ + " variavel \\\n", + "0 Age \n", + "1 Gender \n", + "2 Height \n", + "3 Weight \n", + "4 CALC \n", + "5 FAVC \n", + "6 FCVC \n", + "7 NCP \n", + "8 SCC \n", + "9 SMOKE \n", + "10 CH2O \n", + "11 family_history_with_overweight \n", + "12 FAF \n", + "13 TUE \n", + "14 CAEC \n", + "15 MTRANS \n", + "16 NObeyesdad \n", + "\n", + " descricao \\\n", + "0 idade do indivíduo \n", + "1 gênero do indivíduo \n", + "2 altura do indiíduo \n", + "3 peso do indivíduo \n", + "4 frequência do consumo de álcool pelo indivíduo \n", + "5 indica se o indivíduo consome comidas altamentes calóricas com frequência \n", + "6 indica o nível de consumo de vegetais nas refeições do indivíduo \n", + "7 quantas refeições principais o indivíduo faz diariamente \n", + "8 indica se o indivíduo monitora as calorias ingeridas diariamente \n", + "9 indica se o indivíduo fuma ou não \n", + "10 quanta água o indivíduo consome diariamente \n", + "11 indica se algum membro da família do indivídio sofreu ou sofre com excesso de peso \n", + "12 quão frequentemente o indivíduo pratica atividades físicas \n", + "13 quanto tempo o indivíduo passa usando dispositivos tecnológicos \n", + "14 frequência em que o indivíduo come algum alimento entre as refeições \n", + "15 tipo de transporte que o indivíduo costuma usar \n", + "16 nível de obesidade do indivíduo \n", + "\n", + " tipo subtipo \n", + "0 quantitativa contínua \n", + "1 qualitativa nominal \n", + "2 quantitativa contínua \n", + "3 quantitativa contínua \n", + "4 qualitativa ordinal \n", + "5 qualitativa nominal \n", + "6 quantitativa discreta \n", + "7 quantitativa contínua \n", + "8 qualitativa nominal \n", + "9 qualitativa nominal \n", + "10 quantitativa contínua \n", + "11 qualitativa nominal \n", + "12 quantitativa contínua \n", + "13 quantitativa contínua \n", + "14 qualitativa ordinal \n", + "15 qualitativa nominal \n", + "16 qualitativa ordinal " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dicionario = pd.read_csv('../data/external/dictionary.csv')\n", + "pd.set_option('display.max_colwidth', None)\n", + "dicionario" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "## 1. Descrição dos dados" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![image.png](https://jaleko-blog-files.s3.amazonaws.com/wp-content/uploads/2020/11/05142745/large-Dia-combate-obesidade-810x693.png)!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "[Obesity Level](https://www.kaggle.com/datasets/fatemehmehrparvar/obesity-levels) é um conjunto de dados com informações para a estimativa dos níveis de obesidade em indivíduos dos países México, Peru e Colômbia, com base em seus hábitos alimentares e condição física.\n", + "\n", + "Em projetos de *machine learning*, a variável ```NObeyesdad``` será a variável *target*, enquando as demais variáveis serão as *features*\n", + "\n", + "Estes dados foram disponibilizados por [Fatemeh Mehrparvar](https://www.kaggle.com/fatemehmehrparvar)" + ] }, { "cell_type": "markdown",