From c0fa045423111123543785e6a4d32653e80ee26f Mon Sep 17 00:00:00 2001 From: Justin Bois Date: Wed, 19 Jun 2024 21:45:49 -0700 Subject: [PATCH] Added exercise 2 solutions. --- .../exercise_2.1_solution.ipynb.txt | 262 ++ .../exercise_2.2_solution.ipynb.txt | 341 ++ .../exercise_2.3_solution.ipynb.txt | 740 ++++ .../exercise_2.4_solution.ipynb.txt | 864 +++++ .../exercise_2/index.rst.txt | 11 + .../lessons/bootcamp_live/Untitled.ipynb.txt | 1637 --------- .../lessons/merging_dataframes.ipynb.txt | 2975 +++++++++++++++++ .../lessons/stacking_and_unstacking.ipynb.txt | 2358 +++++++++++++ .../exercise_1/exercise_1.1_solution.html | 5 +- .../exercise_1/exercise_1.2_solution.html | 5 +- .../exercise_1/exercise_1.3_solution.html | 5 +- .../exercise_1/exercise_1.4_solution.html | 7 +- .../exercise_1/exercise_1.5_solution.html | 5 +- .../exercise_1/exercise_1.6_solution.html | 9 +- 2024/exercise_solutions/exercise_1/index.html | 5 +- .../exercise_2/exercise_2.1_solution.html | 385 +++ .../exercise_2/exercise_2.1_solution.ipynb | 262 ++ .../exercise_2/exercise_2.2_solution.html | 456 +++ .../exercise_2/exercise_2.2_solution.ipynb | 341 ++ .../exercise_2/exercise_2.3_solution.html | 755 +++++ .../exercise_2/exercise_2.3_solution.ipynb | 740 ++++ .../exercise_2/exercise_2.4_solution.html} | 1709 +++------- .../exercise_2/exercise_2.4_solution.ipynb | 864 +++++ 2024/exercise_solutions/exercise_2/index.html | 249 ++ 2024/exercises/exercise_1/exercise_1.1.html | 5 +- 2024/exercises/exercise_1/exercise_1.2.html | 5 +- 2024/exercises/exercise_1/exercise_1.3.html | 7 +- 2024/exercises/exercise_1/exercise_1.4.html | 5 +- 2024/exercises/exercise_1/exercise_1.5.html | 5 +- 2024/exercises/exercise_1/exercise_1.6.html | 5 +- 2024/exercises/exercise_1/index.html | 5 +- 2024/exercises/exercise_2/exercise_2.1.html | 5 +- 2024/exercises/exercise_2/exercise_2.2.html | 5 +- 2024/exercises/exercise_2/exercise_2.3.html | 5 +- 2024/exercises/exercise_2/exercise_2.4.html | 5 +- 
2024/exercises/exercise_2/index.html | 5 +- 2024/exercises/exercise_3/exercise_3.1.html | 7 +- 2024/exercises/exercise_3/exercise_3.2.html | 5 +- 2024/exercises/exercise_3/exercise_3.3.html | 7 +- 2024/exercises/exercise_3/exercise_3.4.html | 7 +- 2024/exercises/exercise_3/exercise_3.5.html | 5 +- 2024/exercises/exercise_3/index.html | 5 +- 2024/exercises/exercise_4/exercise_4.1.html | 5 +- 2024/exercises/exercise_4/exercise_4.2.html | 7 +- 2024/exercises/exercise_4/exercise_4.3.html | 7 +- 2024/exercises/exercise_4/exercise_4.4.html | 7 +- 2024/exercises/exercise_4/exercise_4.5.html | 5 +- 2024/exercises/exercise_4/index.html | 5 +- 2024/exercises/exercise_5/exercise_5.1.html | 5 +- 2024/exercises/exercise_5/exercise_5.10.html | 5 +- 2024/exercises/exercise_5/exercise_5.11.html | 5 +- 2024/exercises/exercise_5/exercise_5.12.html | 5 +- 2024/exercises/exercise_5/exercise_5.2.html | 5 +- 2024/exercises/exercise_5/exercise_5.3.html | 5 +- 2024/exercises/exercise_5/exercise_5.4.html | 5 +- 2024/exercises/exercise_5/exercise_5.5.html | 5 +- 2024/exercises/exercise_5/exercise_5.6.html | 5 +- 2024/exercises/exercise_5/exercise_5.7.html | 5 +- 2024/exercises/exercise_5/exercise_5.8.html | 5 +- 2024/exercises/exercise_5/exercise_5.9.html | 5 +- 2024/exercises/exercise_5/index.html | 5 +- 2024/genindex.html | 5 +- 2024/index.html | 6 +- 2024/lessons/Untitled.html | 5 +- 2024/lessons/Untitled1.html | 5 +- 2024/lessons/Untitled2.html | 5 +- 2024/lessons/bootcamp_live/Untitled.ipynb | 1637 --------- .../l00_configuring_your_computer.html | 5 +- 2024/lessons/l01_welcome.html | 5 +- .../l02_basic_command_line_skills.html | 5 +- .../l03_variables_operators_types.html | 5 +- .../l04_more_operators_and_conditionals.html | 5 +- 2024/lessons/l05_lists_and_tuples.html | 5 +- 2024/lessons/l06_iteration.html | 5 +- 2024/lessons/l07_intro_to_functions.html | 5 +- 2024/lessons/l08_string_methods.html | 5 +- 2024/lessons/l09_dictionaries.html | 5 +- 
2024/lessons/l10_packages_and_modules.html | 5 +- 2024/lessons/l11_file_io.html | 5 +- .../lessons/l12_version_control_with_git.html | 7 +- .../l13_exceptions_and_error_handling.html | 5 +- 2024/lessons/l14_style.html | 5 +- 2024/lessons/l15_comprehensions.html | 5 +- 2024/lessons/l16_intro_to_pandas.html | 5 +- 2024/lessons/l17_split_apply_combine.html | 5 +- 2024/lessons/l18_plotting.html | 5 +- 2024/lessons/l19_high_level_plotting.html | 5 +- 2024/lessons/l20_styling_bokeh.html | 5 +- .../lessons/l21_intro_to_numpy_and_scipy.html | 5 +- ...2_plotting_time_series_generated_data.html | 5 +- .../l23_other_packages_and_languages.html | 5 +- 2024/lessons/l24_bootcamp_recap.html | 5 +- .../lessons/l25_random_number_generation.html | 5 +- 2024/lessons/l26_hackerstats_1.html | 5 +- 2024/lessons/l27_hackerstats_2.html | 5 +- 2024/lessons/l28_dashboards.html | 5 +- 2024/lessons/l29_javascript_for_bokeh.html | 5 +- 2024/lessons/l30_control_of_devices.html | 5 +- .../l31_apps_for_external_devices.html | 5 +- 2024/lessons/l32_control_panels.html | 5 +- 2024/lessons/l33_more_command_line.html | 5 +- 2024/lessons/l34_regular_expressions.html | 5 +- 2024/lessons/l35_intro_to_scripting.html | 5 +- 2024/lessons/l36_intro_to_oop.html | 5 +- 2024/lessons/l37_algorithmic_complexity.html | 5 +- 2024/lessons/l38_testing_and_tdd.html | 5 +- 2024/lessons/l39_examples_of_tdd.html | 5 +- 2024/lessons/l40_holoviews.html | 5 +- 2024/lessons/l41_altair.html | 5 +- 2024/lessons/l42_more_altair.html | 5 +- 2024/lessons/l43_overplotting.html | 5 +- .../l44_intro_to_image_processing.html | 5 +- 2024/lessons/l45_segmentation.html | 5 +- .../lessons/l46_plotting_with_matplotlib.html | 5 +- 2024/lessons/merging_dataframes.html | 2408 +++++++++++++ 2024/lessons/merging_dataframes.ipynb | 2975 +++++++++++++++++ 2024/lessons/stacking_and_unstacking.html | 2102 ++++++++++++ 2024/lessons/stacking_and_unstacking.ipynb | 2358 +++++++++++++ 2024/objects.inv | Bin 32123 -> 33300 bytes 2024/resources.html | 
5 +- 2024/schedule.html | 11 +- 2024/search.html | 5 +- 2024/searchindex.js | 2 +- 123 files changed, 22266 insertions(+), 4689 deletions(-) create mode 100644 2024/_sources/exercise_solutions/exercise_2/exercise_2.1_solution.ipynb.txt create mode 100644 2024/_sources/exercise_solutions/exercise_2/exercise_2.2_solution.ipynb.txt create mode 100644 2024/_sources/exercise_solutions/exercise_2/exercise_2.3_solution.ipynb.txt create mode 100644 2024/_sources/exercise_solutions/exercise_2/exercise_2.4_solution.ipynb.txt create mode 100644 2024/_sources/exercise_solutions/exercise_2/index.rst.txt delete mode 100644 2024/_sources/lessons/bootcamp_live/Untitled.ipynb.txt create mode 100644 2024/_sources/lessons/merging_dataframes.ipynb.txt create mode 100644 2024/_sources/lessons/stacking_and_unstacking.ipynb.txt create mode 100644 2024/exercise_solutions/exercise_2/exercise_2.1_solution.html create mode 100644 2024/exercise_solutions/exercise_2/exercise_2.1_solution.ipynb create mode 100644 2024/exercise_solutions/exercise_2/exercise_2.2_solution.html create mode 100644 2024/exercise_solutions/exercise_2/exercise_2.2_solution.ipynb create mode 100644 2024/exercise_solutions/exercise_2/exercise_2.3_solution.html create mode 100644 2024/exercise_solutions/exercise_2/exercise_2.3_solution.ipynb rename 2024/{lessons/bootcamp_live/Untitled.html => exercise_solutions/exercise_2/exercise_2.4_solution.html} (91%) create mode 100644 2024/exercise_solutions/exercise_2/exercise_2.4_solution.ipynb create mode 100644 2024/exercise_solutions/exercise_2/index.html delete mode 100644 2024/lessons/bootcamp_live/Untitled.ipynb create mode 100644 2024/lessons/merging_dataframes.html create mode 100644 2024/lessons/merging_dataframes.ipynb create mode 100644 2024/lessons/stacking_and_unstacking.html create mode 100644 2024/lessons/stacking_and_unstacking.ipynb diff --git a/2024/_sources/exercise_solutions/exercise_2/exercise_2.1_solution.ipynb.txt 
b/2024/_sources/exercise_solutions/exercise_2/exercise_2.1_solution.ipynb.txt new file mode 100644 index 00000000..9d926c61 --- /dev/null +++ b/2024/_sources/exercise_solutions/exercise_2/exercise_2.1_solution.ipynb.txt @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise 2.1: Parsing a FASTA file\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are packages, like [Biopython](http://biopython.org/) and [scikit-bio](http://scikit-bio.org), for processing files you encounter in bioinformatics. In this problem, though, we will work on our file I/O skills. \n", + "\n", + "**a)** Use command line tools to investigate the [FASTA file](https://en.wikipedia.org/wiki/FASTA_format) located at `~/git/bootcamp/data/salmonella_spi1_region.fna`. This file contains a portion of the _Salmonella_ genome (described in [Exercise 2.3](exercise_2.3.ipynb)).\n", + "\n", + "You will notice that the first line begins with a `>`, signifying that the line contains information about the sequence. The remainder of the lines are the sequence itself.\n", + "\n", + "**b)** The format of the _Salmonella_ SPI1 region FASTA file is a common format for such files (though oftentimes FASTA files contain multiple sequences). Use the file I/O skills you have learned to write a function to read in a sequence from a FASTA file containing a single sequence (but possibly having the first line in the file beginning with `>`). Your function should take as input the name of the FASTA file and return two strings. First, it should return the descriptor string (which starts with `>`). Second, it should return a string with no gaps containing the sequence.\n", + "\n", + "Test your function on the _Salmonella_ sequence." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Solution\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**a)** A quick look using `less` indicates that this is a valid FASTA file. I did a quick check to see how many sequences there are by searching for the `>` symbol using `grep`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">gi|821161554|gb|CP011428.1| Salmonella enterica subsp. enterica strain YU39, complete genome, subsequence 3000000 to 3200000\n" + ] + } + ], + "source": [ + "!grep \">\" data/salmonella_spi1_region.fna" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can do the same for the λ DNA file we will use in [Exercise 2.2](exercise_2.2.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">Lambda_NEB\n" + ] + } + ], + "source": [ + "!grep \">\" data/lambdaDNA.fasta" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that in both there is a single record. So, when we read it in, we just need to skip the first line and read on until we get the full record." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**b)** We will read in the files line by line, saving the first line as the descriptor, and then building the sequence as we go along." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def read_fasta(filename):\n", + " \"\"\"Read a sequence in from a FASTA file containing a single sequence.\n", + " \n", + " We assume that the first line of the file is the descriptor and all\n", + " subsequent lines are sequence. 
\n", + " \"\"\"\n", + " with open(filename, \"r\") as f:\n", + " # Read in descriptor\n", + " descriptor = f.readline().rstrip()\n", + "\n", + " # Read in sequence, stripping the whitespace from each line\n", + " seq = \"\"\n", + " line = f.readline().rstrip()\n", + " while line != \"\":\n", + " seq += line\n", + " line = f.readline().rstrip()\n", + "\n", + " return descriptor, seq" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Of course, when writing this function, we should be taking a test-driven development approach, but here we are focusing on our file I/O and string handling skills.\n", + "\n", + "Let's take the function for a drive!" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'AAAACCTTAGTAACTGGACTGCTGGGATTTTTCAGCCTGGATACGCTGGTAGATCTCTTCACGATGGACAGAAACTTCTTTCGGGGCGTTCACGCCAATACGCACCTGGTTGCCCTTCACCCCTAAAACTGTCACGGTGACCTCATCGCCAATCATGAGGGTCTCACCAACTCGACGAGTCAGAATCAGCATTCTTTGCTCCTTGAAAGATTAAAAGAGTCGGGTCTCTCTGTATCCCGGCATTATCCATCATATAACGCCAAAAAGTAAGCGATGACAAACACCTTAGGTGTAAGCAGTCATGGCATTACATTCTGTTAAACCTAAGTTTAGCCGATATACAAAACTTCAACCTGACTTTATCGTTGTCGATAGCGTTGACGTAAACGCCGCAGCACGGGCTGCGGCGCCAACGAACGCTTATAATTATTGCAATTTTGCGCTGACCCAGCCTTGTACACTGGCTAACGCTGCAGGCAGAGCTGCCGCATCCGTACCAC'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "descriptor, seq = read_fasta(\"data/salmonella_spi1_region.fna\")\n", + "\n", + "# Take a look at the first 500 bases to make sure we got it right.\n", + "seq[:500]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks good!\n", + "\n", + "And for the λ-DNA...." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGACAGGTGCTGAAAGCGAGGCTTTTTGGCCTCTGTCGTTTCCTTTCTCTGTTTTTGTCCGTGGAATGAACAATGGAAGTCAACAAAAAGCAGCTGGCTGACATTTTCGGTGCGAGTATCCGTACCATTCAGAACTGGCAGGAACAGGGAATGCCCGTTCTGCGAGGCGGTGGCAAGGGTAATGAGGTGCTTTATGACTCTGCCGCCGTCATAAAATGGTATGCCGAAAGGGATGCTGAAATTGAGAACGAAAAGCTGCGCCGGGAGGTTGAAGAACTGCGGCAGGCCAGCGAGGCAGATCTCCAGCCAGGAACTATTGAGTACGAACGCCATCGACTTACGCGTGCGCAGGCCGACGCACAGGAACTGAAGAATGCCAGAG'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "descriptor_lambda, seq_lambda = read_fasta(\"data/lambdaDNA.fasta\")\n", + "\n", + "# Take a look at the first 500 bases to make sure we got it right.\n", + "seq_lambda[:500]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Computing environment" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python implementation: CPython\n", + "Python version : 3.11.3\n", + "IPython version : 8.12.0\n", + "\n", + "jupyterlab: 3.6.3\n", + "\n" + ] + } + ], + "source": [ + "%load_ext watermark\n", + "%watermark -v -p jupyterlab" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git 
a/2024/_sources/exercise_solutions/exercise_2/exercise_2.2_solution.ipynb.txt b/2024/_sources/exercise_solutions/exercise_2/exercise_2.2_solution.ipynb.txt new file mode 100644 index 00000000..736dac91 --- /dev/null +++ b/2024/_sources/exercise_solutions/exercise_2/exercise_2.2_solution.ipynb.txt @@ -0,0 +1,341 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise 2.2: Restriction enzyme cut sites\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**[Restriction enzymes](https://en.wikipedia.org/wiki/Restriction_enzyme)** cut DNA at specific locations called **restriction sites**. The sequence at a restriction site is called a **recognition sequence**. Here are the recognition sequences of some commonly used restriction enzymes.\n", + "\n", + "|Restriction enzyme | Recognition sequence|\n", + "|:----|:----|\n", + "|[HindIII](https://en.wikipedia.org/wiki/HindIII) | `AAGCTT` |\n", + "|[EcoRI](https://en.wikipedia.org/wiki/EcoRI)| `GAATTC` |\n", + "|KpnI| `GGTACC` |\n", + "\n", + "\n", + "**a)** [New England Biolabs](https://www.neb.com/products/n3011-lambda-dna#Product%20Information) sells purified DNA of the genome of λ-phage, a bacteriophage that infects _E. coli_. You can download the FASTA file containing the sequence [here](https://www.neb.com/-/media/nebus/page-images/tools-and-resources/interactive-tools/dna-sequences-and-maps/text-documents/lambdafsa.txt). Use the function you wrote in [Exercise 2.1](exercise_2.1.ipynb) to extract the sequence.\n", + "\n", + "**b)** Write a function with call signature\n", + "\n", + "```python\n", + "restriction_sites(seq, recog_seq)\n", + "```\n", + "\n", + "that takes as arguments a sequence and the recognition sequence of a restriction enzyme and returns the indices of the first base of each of the restriction sites in the sequence. Use this function to find the indices of the restriction sites of λ-DNA for HindIII, EcoRI, and KpnI. Compare your results with those given [here](https://www.bioinformatics.nl/molbi/SCLResources/LambdaBE_restrct_alphab.htm), which contain a comprehensive list of locations of restriction sites for a variety of enzymes.\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Solution\n", + "\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import re" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**a)** I have downloaded the FASTA file to `data/lambdaDNA.fasta`. Let's load it in." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def read_fasta(filename):\n", + " \"\"\"Read a sequence in from a FASTA file containing a single sequence.\n", + " \n", + " We assume that the first line of the file is the descriptor and all\n", + " subsequent lines are sequence. \n", + " \"\"\"\n", + " with open(filename, 'r') as f:\n", + " # Read in descriptor\n", + " descriptor = f.readline().rstrip()\n", + "\n", + " # Read in sequence, stripping the whitespace from each line\n", + " seq = ''\n", + " line = f.readline().rstrip()\n", + " while line != '':\n", + " seq += line\n", + " line = f.readline().rstrip()\n", + " \n", + " return descriptor, seq\n", + "\n", + "\n", + "_, seq = read_fasta('data/lambdaDNA.fasta')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can check to make sure we got the whole sequence, which should be about 48.5 kbp." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "48502" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(seq)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks good!\n", + "\n", + "**b)** Our goal is to find all locations of the substring given by the recognition sequence in the genome. This is most easily accomplished using the `re` module that uses **[regular expressions](https://en.wikipedia.org/wiki/Regular_expression)**, which we will cover in an auxiliary lesson. 
Specifically, we use `re.finditer()` to automatically find all occurrences of the recognition sequence in the sequence, and then use the `start()` method to get the first index." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def restriction_sites_with_re(seq, recog_seq):\n", + " \"\"\"Find the indices of all restriction sites in a sequence.\"\"\"\n", + " sites = []\n", + " for site in re.finditer(recog_seq, seq):\n", + " sites.append(site.start())\n", + " \n", + " return sites" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check our restriction sites against the known sites." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HindIII: [23129, 25156, 27478, 36894, 37458, 37583, 44140]\n", + "EcoRI: [21225, 26103, 31746, 39167, 44971]\n", + "KpnI: [17052, 18555]\n" + ] + } + ], + "source": [ + "print('HindIII:', restriction_sites_with_re(seq, 'AAGCTT'))\n", + "print('EcoRI: ', restriction_sites_with_re(seq, 'GAATTC'))\n", + "print('KpnI: ', restriction_sites_with_re(seq, 'GGTACC'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Mostly, these match exactly the listed values, except each index reported by our function is one less than the listed values because we are indexing starting at zero. The one example that is different is that the file we are comparing to does not contain the cut site at index 37583. According to the [restriction map from NEB](https://www.neb.com/-/media/nebus/page-images/tools-and-resources/interactive-tools/dna-sequences-and-maps/lambda_map.pdf), NEB sells λ DNA that is a derivative of the strain λ cI857ind1 Sam7. This particular strain has a point mutation in the cI gene. Coincidentally, the point mutation results in an extra HindIII site at 37583. 
Note that this site is very close to the site at 37458, which means that you might not see the resulting short fragment of DNA between those two sites in a gel, since you might run that fragment off the gel.\n", + "\n", + "I will now write a function to do the same calculation without using regular expressions. We will simply make a pass through the sequence and store the indices where we match the recognition sequence." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def restriction_sites(seq, recog_seq):\n", + " \"\"\"Find the indices of all restriction sites in a sequence.\"\"\"\n", + " # Initialize list of restriction sites\n", + " sites = []\n", + "\n", + " # Check every substring for a match\n", + " for i in range(len(seq) - len(recog_seq)):\n", + " if seq[i:i+len(recog_seq)] == recog_seq:\n", + " sites.append(i)\n", + " \n", + " return sites" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And let's use this function." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HindIII: [23129, 25156, 27478, 36894, 37458, 37583, 44140]\n", + "EcoRI: [21225, 26103, 31746, 39167, 44971]\n", + "KpnI: [17052, 18555]\n" + ] + } + ], + "source": [ + "print('HindIII:', restriction_sites(seq, 'AAGCTT'))\n", + "print('EcoRI: ', restriction_sites(seq, 'GAATTC'))\n", + "print('KpnI: ', restriction_sites(seq, 'GGTACC'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For fun, we can compare the speeds of the two functions using the magic function `%timeit`. This function performs a calculation many times and computes a mean execution time. It can be used to check the speed of your functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6.4 ms ± 21.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "151 µs ± 646 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" + ] + } + ], + "source": [ + "%timeit restriction_sites(seq, 'AAGCTT')\n", + "%timeit restriction_sites_with_re(seq, 'AAGCTT')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The regular expression-based method is more than 40 times faster! This is often the case; hand-coding something in pure Python can be slow compared to using packages that use pre-compiled code like `re`. However, as [Donald Knuth](https://en.wikipedia.org/wiki/Donald_Knuth) said, \"The real problem is that programmers have spent far too much time worrying about efficiency in the wrong places and at the wrong times; **premature optimization is the root of all evil (or at least most of it) in programming.**\" Get your code working, even if it is slow, and optimize only if you have to." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Computing environment" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python implementation: CPython\n", + "Python version : 3.11.3\n", + "IPython version : 8.12.0\n", + "\n", + "re : 2.2.1\n", + "jupyterlab: 3.6.3\n", + "\n" + ] + } + ], + "source": [ + "%load_ext watermark\n", + "%watermark -v -p re,jupyterlab" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/2024/_sources/exercise_solutions/exercise_2/exercise_2.3_solution.ipynb.txt b/2024/_sources/exercise_solutions/exercise_2/exercise_2.3_solution.ipynb.txt new file mode 100644 index 00000000..f514ad71 --- /dev/null +++ b/2024/_sources/exercise_solutions/exercise_2/exercise_2.3_solution.ipynb.txt @@ -0,0 +1,740 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise 2.3: Pathogenicity islands\n", + "\n", + "This exercise was inspired by [Libeskind-Hadas and Bush, *Computing for Biologists*, Cambridge University Press, 2014](https://www.cs.hmc.edu/CFB).\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For this and the [next problem](exercise_2.4.ipynb), we will work with real data from the *Salmonella enterica* genome. The section of the genome we will work with is in the file `~/git/bootcamp/data/salmonella_spi1_region.fna`. I cut it out of the [full genome](http://www.ncbi.nlm.nih.gov/nucleotide/821161554). It contains *Salmonella* pathogenicity island I (SPI1), which contains genes for surface receptors for host-pathogen interactions.\n", + "\n", + "Pathogenicity islands are often marked by different GC content than the rest of the genome. We will try to locate the pathogenicity island(s) in our section of the *Salmonella* genome by computing GC content.\n", + "\n", + "**a)** Write a function that divides a sequence into blocks and computes the GC content for each block, returning a tuple. The function signature should look like\n", + "\n", + " gc_blocks(seq, block_size)\n", + " \n", + "To be clear, if `seq = 'ATGACTACGT'` and `block_size = 4`, the blocks to be considered are\n", + "\n", + " ATGA\n", + " CTAC\n", + " \n", + "and the function should return `(0.25, 0.5)`. Note that the blocks are non-overlapping and that we don't bother with the fact that end of the sequence that does not fit completely in a block." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**b)** Write a function that takes as input a sequence, block size, and a threshold GC content, and returns the original sequence where every base in a block with GC content above threshold is capitalized and every base below the threshold is lowercase. You would call the function like this:\n", + "\n", + " mapped_seq = gc_map(seq, block_size, gc_thresh)\n", + "\n", + "For example, \n", + "\n", + " gc_map('ATGACTACGT', 4, 0.4)\n", + "\n", + "returns `'atgaCTAC'`. Note that bases not included in GC blocks (in this case the `GT` at the end of the sequence) are not included in the output sequence." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**c)** Use the `gc_map()` function to generate a GC content map for the *Salmonella* sequence with `block_size = 1000` and `gc_thresh = 0.45`. Where do you think the pathogenicity island is?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**d)** Write the GC-mapped sequence (with upper and lower characters) to a new FASTA file. Use the same description line (which began with a `>` in the original FASTA file), and have line breaks every 60 characters in the sequence." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Solution\n", + "\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**a)** In my approach to this problem, I will employ **test-driven development** (TDD). The idea is that you write tests for your functions _first_ and then develop your functions around passing tests. The suite of tests for a function grows with bug fixes and feature additions. \n", + "\n", + "So, first let's write our tests. In writing the tests, I am making the design decision that I will count the characters `G`, `g`, `C`, and `c` as contributing to GC content, and that I will not check to make sure the sequence is valid. I also make the design decision that an empty sequence has zero GC content." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def test_gc_content():\n", + " assert gc_content('') == 0.0\n", + " assert gc_content('G') == 1.0\n", + " assert gc_content('g') == 1.0\n", + " assert gc_content('C') == 1.0\n", + " assert gc_content('c') == 1.0\n", + " assert gc_content('gcgcgc') == 1.0\n", + " assert gc_content('aaatatata') == 0.0\n", + " assert np.isclose(gc_content('ggatcggcga'), 0.7)\n", + " assert np.isclose(gc_content('attgggggcaatta'), 3/7)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The function is fairly simple. We loop through the sequence with a stride equal to the block size, computing the GC content for each subsequence of that length. We start with a function to compute GC content for a sequence." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def gc_content(seq):\n", + " \"\"\"GC content of a given sequence\"\"\"\n", + " if seq == '':\n", + " return 0.0\n", + " \n", + " seq = seq.upper()\n", + " return (seq.count('G') + seq.count('C')) / len(seq)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's test it." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "test_gc_content()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Passage! Next, we write the looping function, starting with its tests." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def test_gc_blocks():\n", + " assert gc_blocks('', 10) == tuple()\n", + " assert gc_blocks('gcgcgcgcg', 10) == tuple()\n", + " assert gc_blocks('gcgcgcg', 4) == (1.0,)\n", + " assert gc_blocks('gcgcgcgc', 4) == (1.0, 1.0)\n", + " assert gc_blocks('gcgcgcgcat', 4) == (1.0, 1.0)\n", + "\n", + " test_tuple = gc_blocks('gcgagcgcat', 4)\n", + " assert np.isclose(test_tuple[0], 0.75) and test_tuple[1] == 1.0\n", + " \n", + " assert gc_blocks('gcgtagagc', 1) == (1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0)\n", + " assert gc_blocks('gcgtagagc', 2) == (1.0, 0.5, 0.5, 0.5)\n", + " assert np.isclose(gc_blocks('gcgtagagc', 3), (1.0, 1/3, 2/3)).sum() == 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's write our function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def gc_blocks(seq, block_size):\n", + " \"\"\"\n", + " Divide sequence into non-overlapping blocks\n", + " and compute GC content of each block.\n", + " \"\"\"\n", + " blocks = []\n", + " for i in range(0, len(seq) - (len(seq) % block_size), block_size):\n", + " blocks.append(gc_content(seq[i:i+block_size]))\n", + " return tuple(blocks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And the tests...." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "test_gc_blocks()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Success! Let's take this function for a spin, looking at 1000-base blocks. We will use the FASTA reader function from a previous exercise to read in the _Salmonella_ genome fragment." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.521,\n", + " 0.556,\n", + " 0.54,\n", + " 0.498,\n", + " 0.551,\n", + " 0.508,\n", + " 0.563,\n", + " 0.484,\n", + " 0.58,\n", + " 0.557,\n", + " 0.523,\n", + " 0.524,\n", + " 0.621,\n", + " 0.556,\n", + " 0.481,\n", + " 0.57,\n", + " 0.581,\n", + " 0.614,\n", + " 0.603,\n", + " 0.526,\n", + " 0.524,\n", + " 0.591,\n", + " 0.563,\n", + " 0.596,\n", + " 0.563,\n", + " 0.6,\n", + " 0.613,\n", + " 0.594,\n", + " 0.486,\n", + " 0.554,\n", + " 0.566,\n", + " 0.592,\n", + " 0.563,\n", + " 0.537,\n", + " 0.575,\n", + " 0.501,\n", + " 0.54,\n", + " 0.555,\n", + " 0.487,\n", + " 0.416,\n", + " 0.423,\n", + " 0.371,\n", + " 0.394,\n", + " 0.48,\n", + " 0.454,\n", + " 0.474,\n", + " 0.434,\n", + " 0.396,\n", + " 0.37,\n", + " 0.456,\n", + " 0.409,\n", + " 0.457,\n", + " 0.4,\n", + " 0.405,\n", + " 0.475,\n", + " 0.47,\n", + " 0.479,\n", + " 0.494,\n", + " 0.497,\n", + " 0.516,\n", 
+ " 0.444,\n", + " 0.433,\n", + " 0.471,\n", + " 0.458,\n", + " 0.53,\n", + " 0.458,\n", + " 0.56,\n", + " 0.427,\n", + " 0.47,\n", + " 0.438,\n", + " 0.465,\n", + " 0.473,\n", + " 0.46,\n", + " 0.399,\n", + " 0.426,\n", + " 0.359,\n", + " 0.469,\n", + " 0.433,\n", + " 0.425,\n", + " 0.504,\n", + " 0.578,\n", + " 0.576,\n", + " 0.553,\n", + " 0.531,\n", + " 0.57,\n", + " 0.599,\n", + " 0.562,\n", + " 0.555,\n", + " 0.595,\n", + " 0.586,\n", + " 0.55,\n", + " 0.56,\n", + " 0.545,\n", + " 0.553,\n", + " 0.537,\n", + " 0.519,\n", + " 0.519,\n", + " 0.567,\n", + " 0.551,\n", + " 0.548,\n", + " 0.559,\n", + " 0.527,\n", + " 0.559,\n", + " 0.529,\n", + " 0.49,\n", + " 0.533,\n", + " 0.58,\n", + " 0.545,\n", + " 0.558,\n", + " 0.575,\n", + " 0.555,\n", + " 0.49,\n", + " 0.567,\n", + " 0.515,\n", + " 0.518,\n", + " 0.485,\n", + " 0.38,\n", + " 0.461,\n", + " 0.568,\n", + " 0.575,\n", + " 0.567,\n", + " 0.57,\n", + " 0.472,\n", + " 0.513,\n", + " 0.582,\n", + " 0.476,\n", + " 0.505,\n", + " 0.524,\n", + " 0.51,\n", + " 0.512,\n", + " 0.391,\n", + " 0.463,\n", + " 0.57,\n", + " 0.546,\n", + " 0.535,\n", + " 0.525,\n", + " 0.525,\n", + " 0.529,\n", + " 0.58,\n", + " 0.555,\n", + " 0.558,\n", + " 0.563,\n", + " 0.525,\n", + " 0.505,\n", + " 0.557,\n", + " 0.554,\n", + " 0.484,\n", + " 0.525,\n", + " 0.567,\n", + " 0.467,\n", + " 0.527,\n", + " 0.55,\n", + " 0.577,\n", + " 0.554,\n", + " 0.538,\n", + " 0.429,\n", + " 0.507,\n", + " 0.557,\n", + " 0.592,\n", + " 0.595,\n", + " 0.554,\n", + " 0.521,\n", + " 0.539,\n", + " 0.521,\n", + " 0.45,\n", + " 0.608,\n", + " 0.489,\n", + " 0.477,\n", + " 0.552,\n", + " 0.508,\n", + " 0.544,\n", + " 0.495,\n", + " 0.543,\n", + " 0.56,\n", + " 0.596,\n", + " 0.547,\n", + " 0.581,\n", + " 0.548,\n", + " 0.537,\n", + " 0.529,\n", + " 0.513,\n", + " 0.499,\n", + " 0.545,\n", + " 0.567,\n", + " 0.52,\n", + " 0.545,\n", + " 0.548,\n", + " 0.522,\n", + " 0.533,\n", + " 0.558,\n", + " 0.586,\n", + " 0.469,\n", + " 0.516,\n", + " 0.509,\n", + " 
0.511,\n", + " 0.569,\n", + " 0.575,\n", + " 0.559,\n", + " 0.545,\n", + " 0.502)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def read_fasta(filename):\n", + " \"\"\"Read a sequence in from a FASTA file containing a single sequence.\n", + " \n", + " We assume that the first line of the file is the descriptor and all\n", + " subsequent lines are sequence. \n", + " \"\"\"\n", + " with open(filename, 'r') as f:\n", + " # Read in descriptor\n", + " descriptor = f.readline().rstrip()\n", + "\n", + " # Read in sequence, stripping the whitespace from each line\n", + " seq = ''\n", + " line = f.readline().rstrip()\n", + " while line != '':\n", + " seq += line\n", + " line = f.readline().rstrip()\n", + " \n", + " return descriptor, seq\n", + "\n", + "\n", + "descriptor, seq = read_fasta('data/salmonella_spi1_region.fna')\n", + "\n", + "gc_blocks(seq, 1000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We get a tuple of GC content, which is hard to look at on screen, but this is useful for plotting GC content over the course of a sequence. We will learn how to plot later in the bootcamp." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**b)** We just use our already-written `gc_content()` function to decide how to modify the string of the sequence. First, the tests. We make the design decision that we will truncate the sequence if the 3'-most end is shorter than the block length." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def test_gc_map():\n", + " assert gc_map('', 10, 0.5) == ''\n", + " assert gc_map('ATATATATA', 4, 0.5) == 'atatatat'\n", + " assert gc_map('GCGCGCGCG', 4, 0.5) == 'GCGCGCGC'\n", + " assert gc_map('GATCGATCC', 4, 0.5) == 'GATCGATC'\n", + " assert gc_map('GATCGATCC', 4, 0.51) == 'gatcgatc'\n", + " assert gc_map('GATCGATCC', 3, 0.5) == 'gatCGATCC'\n", + " assert gc_map('GATCGATCC', 3, 0.75) == 'gatcgatcc'\n", + " assert gc_map('GATCGATCC', 3, 0.25) == 'GATCGATCC'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the function...." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def gc_map(seq, block_size, gc_thresh):\n", + " \"\"\"Give back seq with lowercase letters where GC content is low.\"\"\" \n", + " out_seq = ''\n", + "\n", + " # Determine GC content of each block and change string accordingly\n", + " for i in range(0, len(seq) - (len(seq) % block_size), block_size):\n", + " if gc_content(seq[i:i+block_size]) < gc_thresh:\n", + " out_seq += seq[i:i+block_size].lower()\n", + " else:\n", + " out_seq += seq[i:i+block_size].upper()\n", + "\n", + " return out_seq" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And the tests." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "test_gc_map()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Passage! We can now use these functions to analyze sequences of interest." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**c)** Let's do it for *Salmonella*!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sal_gcmap = gc_map(seq, 1000, 0.45)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To save on display space, we will not display the sequence here. Scrolling through the GC map file generated in the next part, the pathogenicity island appears to occur about a quarter of the way into this subsequence." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**d)** To write the file out, we use the fact that we conveniently kept the description text when we parsed the *Salmonella* FASTA file in the first place. We then just write the `sal_gcmap` string in blocks of 60. We have to make sure to get the last few bases as well." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Write the result\n", + "with open('salmonella_spi1_region_gc_map.fna', 'w') as f:\n", + " # Write description text\n", + " f.write(descriptor + '\\n')\n", + "\n", + " # Write sequence in blocks of 60\n", + " i = 0\n", + " while i < len(sal_gcmap) - 59:\n", + " f.write(sal_gcmap[i:i+60] + '\\n')\n", + " i += 60\n", + " \n", + " # Write last line\n", + " f.write(sal_gcmap[i:] + '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll take a quick look to see it worked out ok." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">gi|821161554|gb|CP011428.1| Salmonella enterica subsp. 
enterica strain YU39, complete genome, subsequence 3000000 to 3200000\n", + "AAAACCTTAGTAACTGGACTGCTGGGATTTTTCAGCCTGGATACGCTGGTAGATCTCTTC\n", + "ACGATGGACAGAAACTTCTTTCGGGGCGTTCACGCCAATACGCACCTGGTTGCCCTTCAC\n", + "CCCTAAAACTGTCACGGTGACCTCATCGCCAATCATGAGGGTCTCACCAACTCGACGAGT\n", + "CAGAATCAGCATTCTTTGCTCCTTGAAAGATTAAAAGAGTCGGGTCTCTCTGTATCCCGG\n", + "CATTATCCATCATATAACGCCAAAAAGTAAGCGATGACAAACACCTTAGGTGTAAGCAGT\n", + "CATGGCATTACATTCTGTTAAACCTAAGTTTAGCCGATATACAAAACTTCAACCTGACTT\n", + "TATCGTTGTCGATAGCGTTGACGTAAACGCCGCAGCACGGGCTGCGGCGCCAACGAACGC\n", + "TTATAATTATTGCAATTTTGCGCTGACCCAGCCTTGTACACTGGCTAACGCTGCAGGCAG\n", + "AGCTGCCGCATCCGTACCACCGGCTTGCGCCATGTCCGGACGACCGCCACCCTTACCGCC\n", + "...\n", + "ACGCATTTCTCCCGTGCAGGTCACATTTGCCCGACACGGCGGGGCAAGAGGCTTGAACAG\n", + "ACGTTCATTTTCCGTAAAACTGGCGTAATGTAAGCGTTTACCCACTATAGGTATTATCAT\n", + "GGCGACCATAAAAGATGTAGCCCGACTGGCCGGTGTTTCAGTCGCCACCGTTTCTCGCGT\n", + "TATTAACGATTCGCCAAAAGCCAGCGAAGCGTCCCGGCTGGCGGTAACCAGCGCAATGGA\n", + "GTCCCTGAGCTATCACCCTAACGCCAACGCGCGCGCGCTGGCACAGCAGGCAACGGAAAC\n", + "CCTCGGTCTGGTGGTCGGCGACGTTTCCGATCCTTTTTTCGGCGCGATGGTGAAAGCCGT\n", + "TGAACAGGTGGCGTATCACACCGGCAATTTTTTACTGATTGGCAACGGGTATCATAACGA\n", + "ACAAAAAGAGCGTCAGGCTATTGAACAGTTGATTCGTCATCGTTGCGCAGCGTTAGTGGT\n", + "GCACGCCAAAATGATTCCGGATGCGGACCTGGCCTCATTAATGAAGCAAATCCCCGGCAT\n", + "GGTGCTGATTAACCGCATTT\n" + ] + } + ], + "source": [ + "!head salmonella_spi1_region_gc_map.fna\n", + "print('...')\n", + "!tail salmonella_spi1_region_gc_map.fna" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks good!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Computing environment" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python implementation: CPython\n", + "Python version : 3.11.3\n", + "IPython version : 8.12.0\n", + "\n", + "numpy : 1.24.3\n", + "jupyterlab: 3.6.3\n", + "\n" + ] + } + ], + "source": [ + "%load_ext watermark\n", + "%watermark -v -p numpy,jupyterlab" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/2024/_sources/exercise_solutions/exercise_2/exercise_2.4_solution.ipynb.txt b/2024/_sources/exercise_solutions/exercise_2/exercise_2.4_solution.ipynb.txt new file mode 100644 index 00000000..9e101251 --- /dev/null +++ b/2024/_sources/exercise_solutions/exercise_2/exercise_2.4_solution.ipynb.txt @@ -0,0 +1,864 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise 2.4: ORF detection\n", + "\n", + "This exercise was inspired by [Libeskind-Hadas and Bush, *Computing for Biologists*, Cambridge University Press, 2014](https://www.cs.hmc.edu/CFB).\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**a)** Write a function, `longest_orf()`, that takes a DNA sequence as input and finds the longest open reading frame (ORF) in the sequence (we will not consider reverse complements). A sequence fragment constitutes an ORF if the following are all true.\n", + "\n", + "1. It begins with `ATG`.\n", + "2. It ends with any of `TGA`, `TAG`, or `TAA`.\n", + "3. The total number of bases is a multiple of 3.\n", + "\n", + "Note that the sequence `ATG` may appear in the middle of an ORF. So, for example,\n", + "\n", + " GGATGATGATGTAAAAC\n", + "\n", + "has two ORFs, `ATGATGATGTAA` and `ATGATGTAA`. You would return the first one, since it is longer of these two.\n", + "\n", + "*Hint: The statement for this problem is a bit ambiguous as it is written. What other specification might you need for this function?*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**b)** Use your function to find the longest ORF from the section of the *Salmonella* genome we are investigating." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**c)** Write a function that converts a DNA sequence into a protein sequence. You can of course use the `bootcamp_utils` module." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**d)** Translate the longest ORF you generated in part (b) into a protein sequence and perform a [BLAST search](http://blast.ncbi.nlm.nih.gov/). Search for the protein sequence (a blastp query). What gene is it?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**e)** [*Bonus challenge*] Modify your function to return the `n` longest ORFs. Compute the five longest ORFs for the *Salmonella* genome section we are working with. Perform BLAST searches on them. What are they?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Solution\n", + "\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "import bootcamp_utils" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**a)** As in the last problem solution, I will again use principles of TDD.\n", + "\n", + "Something was missing in the specification. Namely, what do we do if there are two ORFs of the same longest length? Do we return the first one, second one, or both? I am arbitrarily choosing to return the one with the 3$'$-most starting index. \n", + "\n", + "Looking ahead to part (e), I will first write a function to return all ORFs that are not entirely included in a longer ORF. For ease of storage and comparison, I will simply store the ORFS as the index of the start of the ORF and the noninclusive index of the last.\n", + "\n", + "Let's now discuss the algorithm we'll use. There are more efficient ways of finding ORFs, but I will choose a very clear way. We'll first find all start codons. For each start codon, we will find the first in-register stop codon. If there is an in-register stop codon, we store this start-stop pair. At the end, we sort them, longest to shortest.\n", + "\n", + "So, we really have three functions we'll use. `find_all_starts(seq)` will return the indices of all start codons in a sequence. `find_next_in_register_stop(seq)` will scan a sequence that starts with `ATG` and return the exclusive final index of the next in register stop codon. In other words, and ORF starting at index `start` is given by\n", + "\n", + " seq[start : start + find_next_in_register_stop(seq[start :])]\n", + "\n", + "If there is no such codon, `find_next_in_register_stop()` returns `-1`. Finally, `all_orfs(seq)` returns the sorted tuple of 2-tuples containing the start/stop pairs of the ORFs.\n", + "\n", + "I will use TDD principles for designing these functions, writing the test cases first." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def test_find_all_starts():\n", + " assert find_all_starts(\"\") == tuple()\n", + " assert find_all_starts(\"GGAGACGACGCAAAAC\".lower()) == tuple()\n", + " assert find_all_starts(\"AAAAAAATGAAATGAGGGGGGTATG\".lower()) == (6, 11, 22)\n", + " assert find_all_starts(\"GGATGATGATGTAAAAC\".lower()) == (2, 5, 8)\n", + " assert find_all_starts(\"GGATGCATGATGTAGAAC\".lower()) == (2, 6, 9)\n", + " assert find_all_starts(\"GGGATGATGATGGGATGGTGAGTAGGGTAAG\".lower()) == (\n", + " 3,\n", + " 6,\n", + " 9,\n", + " 14,\n", + " )\n", + " assert find_all_starts(\"GGGatgatgatgGGatgGtgaGtagGGACtaaG\".lower()) == (\n", + " 3,\n", + " 6,\n", + " 9,\n", + " 14,\n", + " )\n", + "\n", + "\n", + "def test_find_first_in_register_stop():\n", + " assert find_first_in_register_stop(\"\") == -1\n", + " assert find_first_in_register_stop(\"GTAATAGTGA\".lower()) == -1\n", + " assert (\n", + " find_first_in_register_stop(\"AAAAAAAAAAAAAAATAAGGGTAA\".lower()) == 18\n", + " )\n", + " assert find_first_in_register_stop(\"AAAAAACACCGCGTGTACTGA\".lower()) == 21\n", + "\n", + "\n", + "def test_all_orfs():\n", + " assert all_orfs(\"\") == tuple()\n", + " assert all_orfs(\"GGAGACGACGCAAAAC\") == tuple()\n", + " assert all_orfs(\"AAAAAAATGAAATGAGGGGGGTATG\") == ((6, 15),)\n", + " assert all_orfs(\"GGATGATGATGTAAAAC\") == ((2, 14),)\n", + " assert all_orfs(\"GGATGCATGATGTAGAAC\") == ((6, 15),)\n", + " assert all_orfs(\"GGGATGATGATGGGATGGTGAGTAGGGTAAG\") == ((3, 21),)\n", + " assert all_orfs(\"GGGatgatgatgGGatgGtgaGtagGGACtaaG\") == ((14, 32), (3, 21))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll start with the `find_all_starts()` function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def find_all_starts(seq):\n", + " \"\"\"Find all start codons in sequence\"\"\"\n", + " return None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now we'll fail the test." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m test_find_all_starts()\n", + "Cell \u001b[0;32mIn[2], line 2\u001b[0m, in \u001b[0;36mtest_find_all_starts\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtest_find_all_starts\u001b[39m():\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m find_all_starts(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mtuple\u001b[39m()\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m find_all_starts(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGGAGACGACGCAAAAC\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mlower()) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mtuple\u001b[39m()\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m find_all_starts(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAAAAAAATGAAATGAGGGGGGTATG\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mlower()) \u001b[38;5;241m==\u001b[39m (\u001b[38;5;241m6\u001b[39m, \u001b[38;5;241m11\u001b[39m, \u001b[38;5;241m22\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "test_find_all_starts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we'll 
write the function. I'll use regular expressions first, but will also code up the function without them." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def find_all_starts(seq):\n", + " \"\"\"Find the starting index of all start codons in a lowercase seq\"\"\"\n", + " # Compile regex for start codons\n", + " regex_start = re.compile('atg')\n", + " \n", + " # Find the indices of all start codons\n", + " starts = []\n", + " for match in regex_start.finditer(seq):\n", + " starts.append(match.start())\n", + " \n", + " return tuple(starts)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And let's see if it passes the tests." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "test_find_all_starts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Yay! We passed! However, since we did not learn regular expressions this year, I will write a function that finds all start codons that does not use them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def find_all_starts(seq):\n", + " \"\"\"Find the starting index of all start codons in a lowercase seq\"\"\"\n", + " # Initialize array of indices of start codons\n", + " starts = []\n", + " \n", + " # Find index of first start codon (remember, find() returns -1 if not found)\n", + " i = seq.find('atg')\n", + " \n", + " # Keep looking for subsequence incrementing starting point of search\n", + " while i >= 0:\n", + " starts.append(i)\n", + " i = seq.find('atg', i + 1)\n", + " \n", + " return tuple(starts)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's test this new `find_all_starts()` function" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "test_find_all_starts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It passed! Yay! Note how useful TDD is here. Whenever we try new ways of doing things, we can use the tests to make sure we didn't break anything.\n", + "\n", + "Now, let's move on to the next function, which finds the first in-register stop codon. Again, we fail, and then write the function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[9], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfind_first_in_register_stop\u001b[39m(seq):\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m test_find_first_in_register_stop()\n", + "Cell \u001b[0;32mIn[2], line 22\u001b[0m, in \u001b[0;36mtest_find_first_in_register_stop\u001b[0;34m()\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtest_find_first_in_register_stop\u001b[39m():\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m find_first_in_register_stop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m find_first_in_register_stop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGTAATAGTGA\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mlower()) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m (\n\u001b[1;32m 25\u001b[0m find_first_in_register_stop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAAAAAAAAAAAAAAATAAGGGTAA\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mlower()) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m18\u001b[39m\n\u001b[1;32m 26\u001b[0m )\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "def find_first_in_register_stop(seq):\n", + " return None\n", + "\n", + 
"test_find_first_in_register_stop()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nice, beautiful failure. Now, we'll write the function and test it. Again, I'll demonstrate the power of the `re` module." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def find_first_in_register_stop(seq):\n", + " \"\"\"\n", + " Find first stop codon on lowercase seq that starts at an index\n", + " that is divisible by three\n", + " \"\"\"\n", + " # Compile regexes for stop codons\n", + " regex_stop = re.compile('(taa|tag|tga)')\n", + " \n", + " # Stop codon iterator\n", + " stop_iterator = regex_stop.finditer(seq)\n", + "\n", + " # Find next stop codon that is in register\n", + " for stop in stop_iterator:\n", + " if stop.end() % 3 == 0:\n", + " return stop.end()\n", + " \n", + " # Return -1 if we failed to find a stop codon\n", + " return -1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And the test..." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "test_find_first_in_register_stop()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Great! It passes. Now, I'll write it without regular expressions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def find_first_in_register_stop(seq):\n", + " \"\"\"\n", + " Find first stop codon on seq that starts at an index\n", + " that is divisible by three\n", + " \"\"\"\n", + "\n", + " seq = seq.lower()\n", + "\n", + " # Scan sequence for stop codon\n", + " i = 0\n", + " while i < len(seq) - 2 and seq[i:i+3] not in ('taa', 'tag', 'tga'):\n", + " i += 3\n", + "\n", + " # If before end, found codon, return end of codon\n", + " if i < len(seq) - 2:\n", + " return i + 3\n", + " else: # Failed to find stop codon\n", + " return -1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's test this function to make sure it works." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "test_find_first_in_register_stop()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Passage! Finally, we apply TDD to write `all_orfs()`." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def all_orfs(seq):\n", + " \"\"\"Return all ORFs of a sequence.\"\"\"\n", + " # Make sure sequence is all lower case\n", + " seq = seq.lower()\n", + " \n", + " # Find the indices of all start codons\n", + " start_inds = find_all_starts(seq)\n", + " \n", + " # Keep track of stops\n", + " stop_inds = []\n", + " \n", + " # Initialze ORFs. 
Each entry in list is [ORF length, ORF start, ORF stop]\n", + " orfs = []\n", + " \n", + " # For each start codon, find the next stop codon in register\n", + " for start in start_inds:\n", + " relative_stop = find_first_in_register_stop(seq[start:])\n", + " \n", + " if relative_stop != -1:\n", + " # Index of stop codon\n", + " stop = start + relative_stop\n", + " \n", + " # If already had stop, a longer ORF contains this one\n", + " if stop not in stop_inds:\n", + " orfs.append((relative_stop, start, stop))\n", + " stop_inds.append(stop)\n", + " \n", + " # Get sorted list of ORF length\n", + " orfs = sorted(orfs, reverse=True)\n", + " \n", + " # Remove lengths\n", + " for i, orf in enumerate(orfs):\n", + " orfs[i] = (orf[1], orf[2])\n", + " \n", + " return tuple(orfs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now for the tests...." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "test_all_orfs()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Passage! We have succeeded in generating an ordered list of the ORFs. Now, let's get what the problem specified, the longest ORF. Of course, we start with writing tests." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def test_longest_orf():\n", + " assert longest_orf(\"\") == \"\"\n", + " assert longest_orf(\"GGAGACGACGCAAAAC\") == \"\"\n", + " assert longest_orf(\"AAAAAAATGAAATGAGGGGGGTATG\") == \"ATGAAATGA\"\n", + " assert longest_orf(\"GGATGATGATGTAAAAC\") == \"ATGATGATGTAA\"\n", + " assert longest_orf(\"GGATGCATGATGTAGAAC\") == \"ATGATGTAG\"\n", + " assert longest_orf(\"GGGATGATGATGGGATGGTGAGTAGGGTAAG\") == \"ATGATGATGGGATGGTGA\"\n", + " assert longest_orf(\"GGGatgatgatgGGatgGtgaGtagGGACtaaG\") == \"atgGtgaGtagGGACtaa\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll fail them...."
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mlongest_orf\u001b[39m(seq):\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m test_longest_orf()\n", + "Cell \u001b[0;32mIn[16], line 2\u001b[0m, in \u001b[0;36mtest_longest_orf\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtest_longest_orf\u001b[39m():\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m longest_orf(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m longest_orf(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGGAGACGACGCAAAAC\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m longest_orf(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAAAAAAATGAAATGAGGGGGGTATG\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mATGAAATGA\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "def longest_orf(seq):\n", + " return None\n", + "\n", + "test_longest_orf()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll write our function, and then test it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def longest_orf(seq):\n", + " \"\"\"Longest ORF of a sequence.\"\"\"\n", + " orfs = all_orfs(seq)\n", + " \n", + " if len(orfs) == 0:\n", + " return ''\n", + " else:\n", + " return seq[orfs[0][0]:orfs[0][1]]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "test_longest_orf()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Passage! Success! We now have a reliable function for computing the longest ORF." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**b)** We simply use our new function to find the longest ORF of the *Salmonella* sequence." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ATGACCAACTACAGCCTGCGCGCACGCATGATGATTCTGATCCTGGCCCCGACCGTCCTGATAGGTTTGCTGCTCAGTATCTTTTTTGTAGTGCACCGCTATAACGACCTGCAGCGTCAACTGGAAGATGCCGGCGCCAGTATTATTGAACCGCTCGCCGTCTCCAGCGAATATGGTATGAACTTACAAAACCGGGAGTCTATCGGCCAACTTATCAGCGTCCTGCACCGCAGACACTCTGATATTGTGCGGGCGATTTCCGTTTATGACGATCATAACCGTCTGTTTGTAACGTCTAATTTCCATCTGGACCCCTCACAGATGCAGCTTCCCGCCGGAGCGCCGTTTCCACGTCGTCTGAGCGTTGATCGCCACGGCGATATTATGATTCTGCGCACCCCAATTATCTCGGAGAGCTATTCGCCGGACGAGTCAGCGATTGCTGACGCGAAAAATACCAAAAATATGCTGGGGTATGTGGCGCTTGAACTGGATCTCAAGTCGGTCAGGCTACAGCAATACAAAGAGATTTTTATCTCCAGCGTGATGATGCTTTTTTGTATTGGCATTGCGCTGATCTTTGGCTGGCGGCTTATGCGCGATGTCACCGGGCCTATCCGTAATATGGTGAATACCGTTGACCGCATTCGCCGCGGACAACTGGATAGCCGGGTGGAAGGATTTATGCTGGGCGAACTGGATATGCTGAAAAACGGCATTAATTCCATGGCGATGTCGCTTGCCGCCTATCACGAAGAGATGCAGCATAATATCGATCAGGCCACTTCGGACCTGCGTGAAACCCTTGAGCAGATGGAAATCCAAAACGTTGAGCTGGATCTGGCGAAAAAGCGTGCCCAGGAAGCGGCGCGTATTAAGTCGGAGTTCCTGGCGAACATGTCGCACGAACTGCGAACGCCGCTGAACGGCGTCATTGGCTTTACCCGCCTGACATTAAAAACGGAGCTGAATCCCACCCAGCGCGACCATCTGAACACCATTGAGCGTTCCGCGAATAATCTGCTGGCGATCATTAATGACGTGCTTGATTTCTCCAAGCTGGAAGCCGGTAAGCTCATTCTGGAAAG
TATCCCTTTTCCACTGCGTAATACGCTGGATGAAGTGGTTACGCTGCTGGCTCACTCGTCGCATGATAAAGGGCTGGAGTTGACGTTAAATATTAAAAACGACGTCCCGGATAATGTGATTGGCGACCCGCTGCGCCTGCAACAGGTCATTACTAATCTGGTGGGTAATGCCATTAAGTTCACCGAGAGTGGCAATATCGACATTCTGGTAGAAAAGCGGGCGCTCAGTAACACCAAAGTACAGATTGAAGTGCAGATCCGCGATACGGGGATCGGCATTCCGGAACGCGACCAGTCGCGACTGTTTCAGGCGTTTCGCCAGGCCGATGCCAGTATTTCTCGCCGTCACGGCGGCACCGGGCTTGGGCTGGTGATTACGCAAAAGCTGGTCAACGAAATGGGCGGGGATATCTCTTTCCACAGCCAGCCTAATCGCGGCTCGACCTTCTGGTTTCATATTAATCTTGATCTTAACCCAAATGTCATTATTGACGGGCCGTCGACCGCGTGTCTGGCCGGGAAACGGCTGGCTTATGTCGAACCGAATGCTACCGCCGCGCAATGTACCCTGGATCTACTGAGCGACACGCCGGTGGAGGTGGTTTACAGCCCGACCTTCTCCGCGCTGCCGTTAGCGCACTACGATATTATGATTTTGAGCGTTCCGGTGACCTTCCGCGAGCCGCTCACCATGCAGCATGAACGTCTGGCGAAAGCAGCGTCAATGACGGACTTTCTACTGCTGGCGCTACCTTGCCATGCGCAAATTAACGCCGAAAAGCTGAAACAAGGAGGCGCGGCGGCCTGTCTGTTAAAACCATTGACGTCAACGCGCCTGTTGCCAGCGCTGACGGAATATTGCCAGTTGAATCACCATCCTGAACCGCTGCTAATGGATACCAGTAAAATCACCATGACGGTTATGGCGGTTGATGATAATCCCGCTAATCTGAAGCTTATCGGCGCGTTACTGGAAGATAAAGTCCAGCACGTAGAGCTTTGTGATAGCGGACATCAGGCGGTGGATCGGGCGAAACAAATGCAGTTTGATCTGATTTTGATGGATATTCAGATGCCGGATATGGACGGCATACGCGCCTGCGAATTGATTCACCAGCTTCCTCATCAGCAGCAAACACCGGTTATTGCCGTTACGGCACATGCGATGGCCGGGCAAAAAGAGAAGTTGCTCAGCGCGGGCATGAACGACTATCTGGCTAAACCGATAGAAGAAGAGAAGTTGCATAATCTGTTGCTGCGCTATAAACCTGGCGCCAACGTAGCAGCGCGCCTGATGGCGCCGGAACCAGCTGAATTTATCTTCAATCCGAATGCAACGCTCGACTGGCAGCTTGCGCTCCGCCAGGCTGCCGGTAAGCCCGATCTGGCGCGGGATATGCTGCAAATGCTGATTGATTTTCTGCCGGAAGTGCGCAACAAAATTGAAGAACAACTGGTGGGAGAAAATCCCAACGGCCTGGTCGATCTGGTCCATAAGCTACACGGGAGCTGCGGCTATAGCGGCGTACCGCGGATGAAGAACCTTTGCCAGCTTATTGAGCAACAGCTTCGCAGCGGCGTCCACGAAGAGGAGCTGGAGCCTGAGTTTCTGGAGCTGCTGGATGAGATGGATAATGTCGCGCGTGAAGCGAAGAAGATATTAGGCTGA'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def read_fasta(filename):\n", + " \"\"\"Read a sequence in from a FASTA file containing a single sequence.\n", + " \n", + " We assume that the first line of the file is the descriptor and all\n", + " subsequent 
lines are sequence. \n", + " \"\"\"\n", + " with open(filename, 'r') as f:\n", + " # Read in descriptor\n", + " descriptor = f.readline().rstrip()\n", + "\n", + " # Read in sequence, stripping the whitespace from each line\n", + " seq = ''\n", + " line = f.readline().rstrip()\n", + " while line != '':\n", + " seq += line\n", + " line = f.readline().rstrip()\n", + " \n", + " return descriptor, seq\n", + "\n", + "\n", + "# Load in Salmonella sequence\n", + "descriptor, seq = read_fasta('data/salmonella_spi1_region.fna')\n", + "\n", + "# Compute it\n", + "salmonella_orf = longest_orf(seq)\n", + "\n", + "# Look at it\n", + "salmonella_orf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**c)** We can use the `codons` dictionary in the `bootcamp_utils` package to do the translation. With this in hand, we can write our `translate()` function. We will scan the DNA sequence, generate a list of amino acids, and then join them into a protein sequence." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def translate(seq):\n", + " \"\"\"Translate a DNA sequence into protein\"\"\"\n", + " # Make sure sequence is upper case\n", + " seq = seq.upper()\n", + " \n", + " # Find start codon\n", + " i = 0\n", + " while i < len(seq) + 2 and seq[i:i+3] != 'ATG':\n", + " i += 1\n", + "\n", + " # Translate until the stop codon or end of string\n", + " prot = []\n", + " while i < len(seq) - 2 and seq[i:i+3] not in ('TAA', 'TGA', 'TAG'):\n", + " prot.append(bootcamp_utils.codons[seq[i:i+3]])\n", + " i += 3\n", + "\n", + " return ''.join(prot)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**d)** We can now translate the protein" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"'MTNYSLRARMMILILAPTVLIGLLLSIFFVVHRYNDLQRQLEDAGASIIEPLAVSSEYGMNLQNRESIGQLISVLHRRHSDIVRAISVYDDHNRLFVTSNFHLDPSQMQLPAGAPFPRRLSVDRHGDIMILRTPIISESYSPDESAIADAKNTKNMLGYVALELDLKSVRLQQYKEIFISSVMMLFCIGIALIFGWRLMRDVTGPIRNMVNTVDRIRRGQLDSRVEGFMLGELDMLKNGINSMAMSLAAYHEEMQHNIDQATSDLRETLEQMEIQNVELDLAKKRAQEAARIKSEFLANMSHELRTPLNGVIGFTRLTLKTELNPTQRDHLNTIERSANNLLAIINDVLDFSKLEAGKLILESIPFPLRNTLDEVVTLLAHSSHDKGLELTLNIKNDVPDNVIGDPLRLQQVITNLVGNAIKFTESGNIDILVEKRALSNTKVQIEVQIRDTGIGIPERDQSRLFQAFRQADASISRRHGGTGLGLVITQKLVNEMGGDISFHSQPNRGSTFWFHINLDLNPNVIIDGPSTACLAGKRLAYVEPNATAAQCTLDLLSDTPVEVVYSPTFSALPLAHYDIMILSVPVTFREPLTMQHERLAKAASMTDFLLLALPCHAQINAEKLKQGGAAACLLKPLTSTRLLPALTEYCQLNHHPEPLLMDTSKITMTVMAVDDNPANLKLIGALLEDKVQHVELCDSGHQAVDRAKQMQFDLILMDIQMPDMDGIRACELIHQLPHQQQTPVIAVTAHAMAGQKEKLLSAGMNDYLAKPIEEEKLHNLLLRYKPGANVAARLMAPEPAEFIFNPNATLDWQLALRQAAGKPDLARDMLQMLIDFLPEVRNKIEEQLVGENPNGLVDLVHKLHGSCGYSGVPRMKNLCQLIEQQLRSGVHEEELEPEFLELLDEMDNVAREAKKILG'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "translate(salmonella_orf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Doing a BLAST search on this protein indicates that it is a histidine kinase involved in signaling." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**e)** We already are computing all of the ORFs. We can therefore just add a kwarg to our `longest_orf()` function to get the `n` longest ones." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "def longest_orf(seq, n=1):\n", + " \"\"\"Longest ORF of a sequence.\"\"\"\n", + " orfs = all_orfs(seq)\n", + " \n", + " if len(orfs) == 0:\n", + " return ''\n", + " elif n == 1 or len(orfs) == 1:\n", + " return seq[orfs[0][0]:orfs[0][1]]\n", + " else:\n", + " return_list = []\n", + " for i in range(min(n, len(orfs))):\n", + " return_list.append(seq[orfs[i][0]:orfs[i][1]])\n", + " \n", + " return tuple(return_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we'll compute the ORFs, translate them, and make a FASTA file to submit for a BLAST search." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute ORFs\n", + "orfs = longest_orf(seq, n=5)\n", + "\n", + "# Translate them\n", + "prots = []\n", + "for orf in orfs:\n", + " prots.append(translate(orf))\n", + " \n", + "# Make a FASTA file\n", + "with open('sal_seqs.faa', 'w') as f:\n", + " for i, prot in enumerate(prots):\n", + " f.write('> {0:d}\\n'.format(i))\n", + " f.write(prot + '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can take a look at it to see what I did with the above code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 0\n", + "MTNYSLRARMMILILAPTVLIGLLLSIFFVVHRYNDLQRQLEDAGASIIEPLAVSSEYGMNLQNRESIGQLISVLHRRHSDIVRAISVYDDHNRLFVTSNFHLDPSQMQLPAGAPFPRRLSVDRHGDIMILRTPIISESYSPDESAIADAKNTKNMLGYVALELDLKSVRLQQYKEIFISSVMMLFCIGIALIFGWRLMRDVTGPIRNMVNTVDRIRRGQLDSRVEGFMLGELDMLKNGINSMAMSLAAYHEEMQHNIDQATSDLRETLEQMEIQNVELDLAKKRAQEAARIKSEFLANMSHELRTPLNGVIGFTRLTLKTELNPTQRDHLNTIERSANNLLAIINDVLDFSKLEAGKLILESIPFPLRNTLDEVVTLLAHSSHDKGLELTLNIKNDVPDNVIGDPLRLQQVITNLVGNAIKFTESGNIDILVEKRALSNTKVQIEVQIRDTGIGIPERDQSRLFQAFRQADASISRRHGGTGLGLVITQKLVNEMGGDISFHSQPNRGSTFWFHINLDLNPNVIIDGPSTACLAGKRLAYVEPNATAAQCTLDLLSDTPVEVVYSPTFSALPLAHYDIMILSVPVTFREPLTMQHERLAKAASMTDFLLLALPCHAQINAEKLKQGGAAACLLKPLTSTRLLPALTEYCQLNHHPEPLLMDTSKITMTVMAVDDNPANLKLIGALLEDKVQHVELCDSGHQAVDRAKQMQFDLILMDIQMPDMDGIRACELIHQLPHQQQTPVIAVTAHAMAGQKEKLLSAGMNDYLAKPIEEEKLHNLLLRYKPGANVAARLMAPEPAEFIFNPNATLDWQLALRQAAGKPDLARDMLQMLIDFLPEVRNKIEEQLVGENPNGLVDLVHKLHGSCGYSGVPRMKNLCQLIEQQLRSGVHEEELEPEFLELLDEMDNVAREAKKILG\n", + "> 1\n", + "MNESFDKDFSNHTPMMQQYLKLKAQHPEILLFYRMGDFYELFYDDAKRASQLLDISLTKRGASAGEPIPMAGIPHHAVENYLAKLVNQGESVAICEQIGDPATSKGPVERKVVRIVTPGTISDEALLQERQDNLLAAIWQDGKGYGYATLDISSGRFRLSEPADRETMAAELQRTNPAELLYAEDFAEMALIEGRRGLRRRPLWEFEIDTARQQLNLQFGTRDLVGFGVENASRGLCAAGCLLQYVKDTQRTSLPHIRSITMERQQDSIIMDAATRRNLEITQNLAGGVENTLAAVLDCTVTPMGSRMLKRWLHMPVRNTDILRERQQTIGALQDTVSELQPVLRQVGDLERILARLALRTARPRDLARMRHAFQQLPELHAQLETVDSAPVQALRKKMGDFAELRDLLERAIIDAPPVLVRDGGVIAPGYHEELDEWRALADGATDYLDRLEIRERERTGLDTLKVGYNAVHGYYIQISRGQSHLAPINYVRRQTLKNAERYIIPELKEYEDKVLTSKGKALALEKQLYDELFDLLLPHLADLQQSANALAELDVLVNLAERAWTLNYTCPTFTDKPGIRITEGRHPVVEQVLNEPFIANPLNLSPQRRMLIITGPNMGGKSTYMRQTALIALLAYIGSYVPAQNVEIGPIDRIFTRVGAADDLASGRSTFMVEMTETANILHNATENSLVLMDEIGRGTSTYDGLSLAWACAENLANKIKALTLFATHYFELTQLPEKMEGVANVHLDALEHGDTIAFMHSVQDGAASKSYGLAVAALAGVPKEVIKRARQKLRELESISPNAAATQVDGTQMSLLAAPEETSPAVEALENLDPDSLTPRQALEWIYRLKSLV\n", + "> 2\n", + 
"MSYTPMSDLGQQGLFDITRTLLQQPDLASLSEALSQLVKRSALADSAGIVLWQAQSQRAQYYATRENGRPVEYEDETVLAHGPVRRILSRPDALHCNFHEFTETWPQLAASGLYPEFGHYCLLPLAAEGRIFGGCEFIRQEDRPWSEKEYDRLHTFTQIVGVVAEQIQNRVNNNVDYDLLCRERDNFRILVAITNAVLSRLDIDELVSEVAKEIHHYFNIDAISIVLRSHRKNKLNIYSTHYLDEHHPAHEQSEVDEAGTLTERVFKSKEMLLINLNERDPLAPYERMLFDTWGNQIQTLCLLPLMSGKTMLGVLKLAQCEEKVFTTANLKLLRQIAERVAIAVDNALAYQEIHRLKERLVDENLALTEQLNNVDSEFGEIIGRSEAMYNVLKQVEMVAQSDSTVLILGETGTGKELIARAIHNLSGRSGRRMVKMNCAAMPAGLLESDLFGHERGAFTGASAQRIGRFELADKSSLFLDEVGDMPLELQPKLLRVLQEQEFERLGSNKLIQTDVRLIAATNRDLKKMVADREFRNDLYYRLNVFPIQLPPLRERPEDIPLLVKAFTFKIARRMGRNIDSIPAETLRTLSSMEWPGNVRELENVVERAVLLTRGNVLQLSLPDITAVTPDTSPVATESAKEGEDEYQLIIRVLKETNGVVAGPKGAAQRLGLKRTTLLSRMKRLGIDKDALA\n", + "> 3\n", + "MKKISLPKIGIRPVIDGRRMGVRESLEEQTMNMAKATAALITEKIRHACGAQVECVIADTCIAGMAESAACEEKFSSQNVGVTITVTPCWCYGSETIDMDPMRPKAIWGFNGTERPGAVYLAAALAAHSQKGIPAFSIYGHDVQDADDTSIPADVEEKLLRFARAGLAVASMKGKSYLSVGGVSMGIAGSIVDHNFFESWLGMKVQAVDMTELRRRIDQKIYDEAELEMALAWADKNFRYGEDQNASQYKRNEAQNRAVLKESLLMAMCIRDMMQGNKTLADKGLVEESLGYNAIAAGFQGQRHWTDQYPNGDTAEALLNSSFDWNGVREPFVVATENDSLNGVAMLFGHQLTGTAQIFADVRTYWSPEAVERVTGQALSGLAEHGIIHLINSGSAALDGACKQRDSEGKPTMKPHWEISQQEADACLAATEWCPAIHEYFRGGGYSSRFLTEGGVPFTMTRVNIIKGLGPVLQIAEGWSVELPKAMHDQLDARTNSTWPTTWFAPRLTGKGPFTDVYSVMANWGANHGVLTIGHVGADFITLAAMLRIPVCMHNVEEAKIYRPSAWAAHGMDIEGQDYRACQNYGPLYKR\n", + "> 4\n", + "MPHFNPVPVSNKKFVFDDFILNMDGSLLRSEKKVNIPPKEYAVLVILLEAAGEIVSKNTLLDQVWGDAEVNEESLTRCIYALRRILSEDKEHRYIETLYGQGYRFNRPVVVVSPPAPQPTTHTLAILPFQMQDQVQSESLHYSIVKGLSQYAPFGLSVLPVTITKNCRSVKDILELMDQLRPDYYISGQMIPDGNDNIVQIEIVRVKGYHLLHQESIKLIEHQPASLLQNKIANLLLRCIPGLRWDTKQISELNSIDSTMVYLRGKHELNQYTPYSLQQALKLLTQCVNMSPNSIAPYCALAECYLSMAQMGIFDKQNAMIKAKEHAIKATELDHNNPQALGLLGLINTIHSEYIVGSLLFKQANLLSPISADIKYYYGWNLFMAGQLEEALQTINECLKLDPTRAAAGITKLWITYYHTGIDDAIRLGDELRSQHLQDNPILLSMQVMFLSLKGKHELARKLTKEISTQEITGLIAVNLLYAEYCQNSERALPTIREFLESEQRIDNNPGLLPLVLVAHGEAIAEKMWNKFKNEDNIWFKRWKQDPRLIKLR\n" + ] + } + ], + "source": [ + "!cat sal_seqs.faa" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "Upon doing the BLAST search, I found that the genes are:\n", + "\n", + "|Length rank| Description|\n", + "|:---:|:---:|\n", + "|1 | histidine kinase |\n", + "|2 | DNA repair protein MutS|\n", + "|3 | formate hydrogenlyase transcriptional activator|\n", + "|4 | L-fucose isomerase |\n", + "|5 | transcriptional regulator HilA (invasion regulator)|" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Computing environment" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python implementation: CPython\n", + "Python version : 3.11.3\n", + "IPython version : 8.12.0\n", + "\n", + "re : 2.2.1\n", + "bootcamp_utils: 0.0.7\n", + "jupyterlab : 3.6.3\n", + "\n" + ] + } + ], + "source": [ + "%load_ext watermark\n", + "%watermark -v -p re,bootcamp_utils,jupyterlab" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/2024/_sources/exercise_solutions/exercise_2/index.rst.txt b/2024/_sources/exercise_solutions/exercise_2/index.rst.txt new file mode 100644 index 00000000..5811bee9 --- /dev/null +++ b/2024/_sources/exercise_solutions/exercise_2/index.rst.txt @@ -0,0 +1,11 @@ +****************************************************************** +Exercise 2 solutions +****************************************************************** + +.. 
toctree:: + :maxdepth: 1 + + exercise_2.1_solution.ipynb + exercise_2.2_solution.ipynb + exercise_2.3_solution.ipynb + exercise_2.4_solution.ipynb diff --git a/2024/_sources/lessons/bootcamp_live/Untitled.ipynb.txt b/2024/_sources/lessons/bootcamp_live/Untitled.ipynb.txt deleted file mode 100644 index 60909096..00000000 --- a/2024/_sources/lessons/bootcamp_live/Untitled.ipynb.txt +++ /dev/null @@ -1,1637 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "61bc3f6a-870a-4903-a5df-c120add940d2", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = [1, 2, 3, 4]" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9468b95a-6692-4825-8dde-823ea531f79b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "list" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "type(my_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b7fab275-9ab5-4eba-833d-11404b96f1d8", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = [1, 2.4, 'a string', ['a string in another list', 5]]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "6f3772ff-c06d-47bf-a860-6a5b515a0e65", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[5, 15, 16]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list = [2 + 3, 5 * 3, 4**2]\n", - "\n", - "my_list" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "fe9c81ff-ecbf-4f2b-a4ac-5d25d3db9ab2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "42" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "int('42')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "fbc51335-b5fe-45cd-90bc-1abe1fc02841", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['a', ' ', 's', 
't', 'r', 'i', 'n', 'g']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list('a string')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "e3b449d9-277b-45db-bf60-d440d6918950", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 2, 3, 4, 5, 6]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "[1, 2, 3] + [4, 5, 6] " - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "3b665855-9e00-4eb4-a6a5-6a6fb4cf5920", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 2, 3, 1, 2, 3, 1, 2, 3]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "[1, 2, 3] * 3" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "010ef92b-fa37-48cd-a784-680e90de0cc6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[5, 15, 16]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "30b6dc71-73bf-43b9-bc17-16ca54cb14cc", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "15 in my_list" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "735ff7a5-b11d-4a28-ba1f-bd70e97a1641", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'jeffrey lebowski' not in my_list" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "65755e75-be34-4f72-a333-6f8c505190c8", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = [1, 2.4, 'a string', ['a string in another list', 
5]]" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "c8f09808-828d-4c21-9040-c854aad64562", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 2.4, 'a string', ['a string in another list', 5]]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "1c0cdfb0-5b8b-4361-b40a-1c91011ecbed", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "['a string in another list', 5] in my_list" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "id": "f4962ab0-4b4c-49a2-a5fc-ab1fe9e922d5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "This codon is a stop codon.\n" - ] - } - ], - "source": [ - "codon = 'UAA'\n", - "\n", - "if codon == 'AUG':\n", - " print('This codon is the start codon.')\n", - "elif codon in ('UAA', 'UAG', 'UGA'):\n", - " print('This codon is a stop codon.')\n", - "else:\n", - " print('This codon is neither a start nor stop codon.')" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "e1f9fb88-9f0d-4e1a-834b-842c1cbc0d33", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 2.4, 'a string', ['a string in another list', 5]]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "f9dfa49b-6a6f-414b-872d-67bc015f6646", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'a string'" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list[2]" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": 
"60db5f5f-605f-4684-9296-57948133cef0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 2.4, 'a string', ['a string in another list', 5]]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "1ab50364-b2cb-493d-8e7d-f3063ef4fafb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list[3][1]" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "52960b27-9415-4e39-9824-35de067be06d", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "a649fc66-50f5-466a-bfd8-eee2a11d6b75", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list[4]" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "35517bda-ce6e-40e4-b217-833fa52373b7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "c3de8081-c330-4d34-bee0-2ae7a5a56985", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "49cd38ec-615b-4721-8c01-4d37125ada27", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[9, 7, 5, 3]" - ] - }, - "execution_count": 47, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "my_list[-2:2:-2] " - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "49dd5809-ce1b-4360-bc98-27d645ab9047", - "metadata": {}, - "outputs": [], - "source": [ - "my_slice = my_list[1:7:-3]" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "1d7f98d9-d3d0-42b6-a628-bff1fcd07c97", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "e02cc9ba-0b04-4508-91b1-0d8df34c7dfb", - "metadata": {}, - "outputs": [], - "source": [ - "my_list[4] = 'four'" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "bf6bab8c-111d-4219-a8b3-ff2baaf0e34a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0, 1, 2, 3, 'four', 5, 6, 7, 8, 9, 10]" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "eaae8e68-d031-4345-af11-14a9d66a5476", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n", - "my_list2 = my_list\n", - "\n", - "my_list2[0] = 'a'" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "94ab0c98-5a4f-427b-a5e0-c8f2e44fd733", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['a', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list2" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "497d6aba-3c41-4b3e-ace3-637741d8048f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['a', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]" - ] - }, - "execution_count": 59, - "metadata": {}, 
- "output_type": "execute_result" - } - ], - "source": [ - "my_list" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "d09c3dbb-62cc-41ae-bbad-40864beb7a7e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_list is my_list2" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "3c6d3742-e3a8-40a3-92c2-8d480b1b63cf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "7" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a = 5\n", - "b = 7\n", - "a = b\n", - "\n", - "a" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "771f206d-f33b-4f45-937d-5322124cd217", - "metadata": {}, - "outputs": [], - "source": [ - "my_tuple = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "d77ccc15-5f15-4629-bba2-5ad96b7682b4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_tuple" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "1613cc56-5faf-421d-b76a-b16a6898fde3", - "metadata": {}, - "outputs": [], - "source": [ - "my_tuple2 = my_tuple" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "7cf257da-bea8-4ccd-b59d-19e5c015ea1d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_tuple2 is my_tuple" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "43892cb6-658e-4ecf-89e6-a12c454e9782", - "metadata": {}, - "outputs": [], - "source": [ - "my_tuple = (5, 6, 7)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 69, - "id": "ef0e66f8-728b-4588-be63-aeda22abe6ef", - "metadata": {}, - "outputs": [], - "source": [ - "a, b, c = my_tuple" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "65e4ec42-83f3-4794-a8d4-1a014a1a0d9c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "c5d7fc1b-829e-409d-b991-6c2fc42a6fbb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "6" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "b" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "id": "12fda73e-16b1-4e64-8960-e590f3aa9d11", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "7" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "id": "9b483621-600c-4d9a-a940-77743f0ec47b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2 5 6 9 11 12 14 16 19 20 22 23 24 25 26 31 32 34 " - ] - } - ], - "source": [ - "seq = 'UACUACGAUCAGGACUGAUCGACGCGCUAUACGACUA'\n", - "\n", - "for i, base in enumerate(seq):\n", - " if base in 'GCgc':\n", - " print(i, end=' ')" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "id": "94910ea0-afc5-4177-ae44-7debcc9541b8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "37" - ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(seq)" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "id": "d8847189-3c93-4ce1-8ee9-302592d26366", - "metadata": {}, - "outputs": [], - "source": [ - "my_integers = [1, 2, 3, 4, 5]\n", - "\n", - "for i in 
range(len(my_integers)):\n", - " my_integers[i] *= 2" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "id": "00af6e15-ecce-402c-bb70-89ece2685040", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[2, 4, 6, 8, 10]" - ] - }, - "execution_count": 86, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_integers" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "id": "22b0d933-99b2-4389-9653-1882ca6d2a70", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0, 1, 2, 3, 4]" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(range(5))" - ] - }, - { - "cell_type": "code", - "execution_count": 90, - "id": "aefb5663-c2e9-45d9-a594-3ad5f85f871a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23 Acosta MF\n", - "3 Murillo D\n", - "11 Bale F\n" - ] - } - ], - "source": [ - "names = ('Acosta', 'Murillo', 'Bale')\n", - "positions = ('MF', 'D', 'F')\n", - "numbers = (23, 3, 11)\n", - "\n", - "for num, pos, name in zip(numbers, positions, names):\n", - " print(num, name, pos)" - ] - }, - { - "cell_type": "code", - "execution_count": 92, - "id": "3d044c33-d318-4069-b9e2-4bc1953fea30", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10\n", - "9\n", - "8\n", - "7\n", - "6\n", - "5\n", - "4\n", - "3\n", - "2\n", - "1\n", - "ignition\n" - ] - } - ], - "source": [ - "count_up = ('ignition', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)\n", - "\n", - "for count in reversed(count_up):\n", - " print(count)" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "id": "2b281a56-ba37-4052-98cf-3d80bdc6067a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Didn't find codon.\n" - ] - } - ], - "source": [ - "seq = 'UAGUACUACUAGUAUGAUGCCAUCCCUA'\n", - "codon = 'GGG'\n", - "\n", - "i = 
0\n", - "\n", - "while seq[i:i+3] != codon and i < len(seq):\n", - " i += 1\n", - "\n", - "if i == len(seq):\n", - " print(\"Didn't find codon.\")\n", - "else:\n", - " print('The index of the codon is', i)" - ] - }, - { - "cell_type": "code", - "execution_count": 96, - "id": "bc2a2d22-4792-4d7d-a9fa-58089b71862b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "''" - ] - }, - "execution_count": 96, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "seq[100:103]" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "id": "cf51940d-8b3e-4093-b7f0-93f65e8568cd", - "metadata": {}, - "outputs": [], - "source": [ - "def ratio(x, y):\n", - " \"\"\"The ratio of `x` to `y`.\"\"\"\n", - " return x / y" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "id": "1c530201-d085-418d-beb7-7008767a82d8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2.0" - ] - }, - "execution_count": 102, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ratio(4, 2)" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "id": "56d1184f-7d81-46e8-9724-25e82942fb4a", - "metadata": {}, - "outputs": [], - "source": [ - "def answer_to_the_ultimate_question_of_life_the_universe_and_everything():\n", - " \"\"\"Simpler program that Deep Thgouth's, I bet.\"\"\"\n", - " return 42" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "id": "5f5b7b87-3e5e-4fd9-9388-a2277727bea0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "42" - ] - }, - "execution_count": 104, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "answer_to_the_ultimate_question_of_life_the_universe_and_everything()" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "id": "7ebfe2da-794f-4f7d-bc4a-853b9bfe9dff", - "metadata": {}, - "outputs": [], - "source": [ - "def think_too_much():\n", - " \"\"\"Express Caesar's skepticism about 
Cassius.\"\"\"\n", - " print(\"\"\"Yond Cassius has a lean and hungry look,\n", - "He thinks too much; such men are dangerous.\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "id": "4a845090-18fa-4061-8ac0-13265c89d6c7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Yond Cassius has a lean and hungry look,\n", - "He thinks too much; such men are dangerous.\n" - ] - } - ], - "source": [ - "return_val = think_too_much()" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "id": "ff8400ab-90e5-48fe-88b5-1748fe6de5b9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "None\n" - ] - } - ], - "source": [ - "print(return_val)" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "id": "32001416-baec-4017-ab83-8958f3477521", - "metadata": {}, - "outputs": [], - "source": [ - "def evens_up_to_8():\n", - " return 2, 4, 6, 8" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "id": "ee7d4cb5-ec8c-433b-a2bb-0b67ad192c68", - "metadata": {}, - "outputs": [], - "source": [ - "a, b, c, d = evens_up_to_8()" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "id": "f19293e5-c7a8-400b-98bb-64ad1e2ce880", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "6" - ] - }, - "execution_count": 117, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c" - ] - }, - { - "cell_type": "code", - "execution_count": 126, - "id": "f6afd3c7-4b51-40a8-9425-125715b71aef", - "metadata": {}, - "outputs": [], - "source": [ - "def complement_base(base, material='DNA'):\n", - " \"\"\"Return the Watson-Crick complement of a base.\"\"\"\n", - " if base in 'Aa':\n", - " if material == 'DNA':\n", - " return 'T'\n", - " elif material == 'RNA':\n", - " return 'U'\n", - " elif base in 'TtUu':\n", - " return 'A'\n", - " elif base in 'Gg':\n", - " return 'C'\n", - " elif base in 'Cc':\n", - " return 
'G'\n", - " else:\n", - " return ''\n", - " \n", - "\n", - "def reverse_complement(seq, material='DNA'):\n", - " \"\"\"Compute the reverse of a sequence.\"\"\"\n", - " # Initialize the rev comp\n", - " rev_seq = ''\n", - "\n", - " # Loop through in reverse and add each base\n", - " for base in reversed(seq):\n", - " rev_seq += complement_base(base, material)\n", - "\n", - " return rev_seq" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "id": "f8cf4dac-15c3-4f0c-828c-d946c3252023", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'UGCAACUGC'" - ] - }, - "execution_count": 130, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "reverse_complement(seq='GCAGUUGCA', material='RNA')" - ] - }, - { - "cell_type": "code", - "execution_count": 138, - "id": "a1ba54ff-3540-41de-982e-4f036f756f40", - "metadata": {}, - "outputs": [], - "source": [ - "def is_almost_right(a, b, c):\n", - " \"\"\"Check to see if a triangle with side lengths a, b, and c is right.\"\"\"\n", - " # Use sorted() to make sure c is largest\n", - " a, b, c = sorted([a, b, c])\n", - "\n", - " if abs(a**2 + b**2 - c**2) < 1e-12:\n", - " return True\n", - " else:\n", - " return False\n" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "id": "f53c0101-24a9-4e2c-b48a-0adb803aed11", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 143, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "is_almost_right(5, 12, 13)" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "id": "75bb3a9f-fba8-4e7d-b953-80053c1a6058", - "metadata": {}, - "outputs": [], - "source": [ - "side_lengths = (5, 12, 13)" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "id": "8f51234e-ba43-4813-b193-3ecc349076c3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 146, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "is_almost_right(*side_lengths) " - ] - }, - { - "cell_type": "code", - "execution_count": 147, - "id": "922cf9d8-4ee2-406c-aedc-a9411993e5d2", - "metadata": {}, - "outputs": [], - "source": [ - "def ratio(x, y):\n", - " \"\"\"ratio\"\"\"\n", - " return x / y" - ] - }, - { - "cell_type": "code", - "execution_count": 148, - "id": "88a7cd90-bd44-434e-8007-f46c42cf583b", - "metadata": {}, - "outputs": [], - "source": [ - "ratio = lambda x, y: x / y" - ] - }, - { - "cell_type": "code", - "execution_count": 149, - "id": "b4d7b4a8-eaeb-4194-b7e0-68b6d11f7c32", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.7142857142857143" - ] - }, - "execution_count": 149, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ratio(5, 7)" - ] - }, - { - "cell_type": "code", - "execution_count": 157, - "id": "d4e00fd9-5e3a-494e-aa40-ad9aa78d16be", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Kellyn Acosta', 'Gareth Bale', 'Jesus Murillo']" - ] - }, - "execution_count": 157, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sorted(['Kellyn Acosta', 'Jesus Murillo', 'Gareth Bale'], key=lambda x: x[x.find(' ')+1:])" - ] - }, - { - "cell_type": "code", - "execution_count": 158, - "id": "4f2399bc-0c79-4fbd-aacf-5b9efd3d5228", - "metadata": {}, - "outputs": [], - "source": [ - "last_name = lambda x: x[x.find(' ')+1:]" - ] - }, - { - "cell_type": "code", - "execution_count": 159, - "id": "716eb5ef-6b6a-4fc5-941a-9da9d7b49c54", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Lebowski'" - ] - }, - "execution_count": 159, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "last_name(\"Jeffrey Lebowski\")" - ] - }, - { - "cell_type": "code", - "execution_count": 161, - "id": "e8ed68b7-de26-40bb-be4f-8225a4413650", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'.sediba eduD ehT'" - ] 
- }, - "execution_count": 161, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_str = 'The Dude abides.'\n", - "\n", - "my_str[::-1]" - ] - }, - { - "cell_type": "code", - "execution_count": 164, - "id": "ab1337a0-547c-44fc-bf35-33f53d2c66a4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.5428571428571428" - ] - }, - "execution_count": 164, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "seq = 'ATCGATCGCTTCTAGGCGATCGTACGATCGACTGC'\n", - "\n", - "(seq.count('G') + seq.count('C')) / len(seq)" - ] - }, - { - "cell_type": "code", - "execution_count": 165, - "id": "5172ca87-3eef-49ce-b47e-cb3b73134ab8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "6" - ] - }, - "execution_count": 165, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'AGTAGATACAGTATAGTAGT'.count('T')" - ] - }, - { - "cell_type": "code", - "execution_count": 168, - "id": "9eed63b7-a76b-40da-9383-34970e260126", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 168, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'AAAAAAA'.count('nonsense')" - ] - }, - { - "cell_type": "code", - "execution_count": 171, - "id": "3836b019-949d-4cc5-a9bb-84b51d170136", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "25" - ] - }, - "execution_count": 171, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'AGATCGAGAUAGAUGATCGATCAGGGATCG'.rfind('GAT')" - ] - }, - { - "cell_type": "code", - "execution_count": 173, - "id": "83e13637-347d-48c6-9b19-3239d7b25eec", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'AGTGAGATGAG'" - ] - }, - "execution_count": 173, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'AGTGAGATGAG'.lower().upper()" - ] - }, - { - "cell_type": "code", - "execution_count": 175, - "id": 
"6631a01c-da7d-4da5-9f49-ad326273601c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'The*Dude*abides.'" - ] - }, - "execution_count": 175, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "word_tuple = ('The', 'Dude', 'abides.')\n", - "\n", - "'*'.join(word_tuple)" - ] - }, - { - "cell_type": "code", - "execution_count": 180, - "id": "8df46e1f-6af7-4302-947a-88d684764f13", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "During this bootcamp, I feel tiny.\n", - "The instructors give us flowers.\n", - "\n" - ] - } - ], - "source": [ - "adjective = 'tiny'\n", - "plural_noun = 'flowers'\n", - "\n", - "my_str = f\"\"\"\n", - "During this bootcamp, I feel {adjective}.\n", - "The instructors give us {plural_noun}.\n", - "\"\"\"\n", - "\n", - "print(my_str)" - ] - }, - { - "cell_type": "code", - "execution_count": 182, - "id": "2a5acfeb-2e4c-4ef8-b0fc-785c74826e79", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'There are 0050 states in the US.'" - ] - }, - "execution_count": 182, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'There are {n:04d} states in the US.'.format(n=50)" - ] - }, - { - "cell_type": "code", - "execution_count": 184, - "id": "c8b29be1-f827-443b-b35c-660ff5f790f9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'π is approximately 3.141593e+00'" - ] - }, - "execution_count": 184, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pi = 3.1415926535\n", - "f'π is approximately {pi:.6e}'" - ] - }, - { - "cell_type": "code", - "execution_count": 185, - "id": "89d7a73b-32fe-4a3b-8fb9-e9603b3c99fe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1000000000000" - ] - }, - "execution_count": 185, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "1_000_000_000_000" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "6ef7922c-fda7-4e76-9c24-a365467fbd37", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/2024/_sources/lessons/merging_dataframes.ipynb.txt b/2024/_sources/lessons/merging_dataframes.ipynb.txt new file mode 100644 index 00000000..7a8df685 --- /dev/null +++ b/2024/_sources/lessons/merging_dataframes.ipynb.txt @@ -0,0 +1,2975 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Merging and concatenating data frames\n", + "\n", + "[Data set download](https://s3.amazonaws.com/bebi103.caltech.edu/data/frog_strikes.zip)\n", + "\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "nbsphinx": "hidden", + "tags": [] + }, + "outputs": [], + "source": [ + "# Colab setup ------------------\n", + "import os, sys, subprocess\n", + "if \"google.colab\" in sys.modules:\n", + " cmd = \"pip install --upgrade iqplot watermark\"\n", + " process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n", + " stdout, stderr = process.communicate()\n", + " data_path = \"https://s3.amazonaws.com/bebi103.caltech.edu/data/\"\n", + "else:\n", + " data_path = \"../data/\"\n", + "# ------------------------------" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function now() {\n", + " return new Date();\n", + " }\n", + "\n", + " const force = true;\n", + "\n", + " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", + " root._bokeh_onload_callbacks = [];\n", + " root._bokeh_is_loading = undefined;\n", + " }\n", + "\n", + "const JS_MIME_TYPE = 'application/javascript';\n", + " const HTML_MIME_TYPE = 'text/html';\n", + " const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", + " const CLASS_NAME = 'output_bokeh rendered_html';\n", + "\n", + " /**\n", + " * Render data to the DOM node\n", + " */\n", + " function render(props, node) {\n", + " const script = document.createElement(\"script\");\n", + " node.appendChild(script);\n", + " }\n", + "\n", + " /**\n", + " * Handle when an output is cleared or removed\n", + " */\n", + " function handleClearOutput(event, handle) {\n", + " const cell = handle.cell;\n", + "\n", + " const id = cell.output_area._bokeh_element_id;\n", + " const server_id = cell.output_area._bokeh_server_id;\n", + " // Clean up Bokeh references\n", + " if (id != null && id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + "\n", + " if (server_id !== undefined) {\n", + " // Clean up Bokeh references\n", + " const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", + " cell.notebook.kernel.execute(cmd_clean, {\n", + " iopub: {\n", + " output: function(msg) {\n", + " const id = msg.content.text.trim();\n", + " if (id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + " }\n", + " }\n", + " });\n", + " // Destroy server and session\n", + " const cmd_destroy = \"import bokeh.io.notebook as ion; 
ion.destroy_server('\" + server_id + \"')\";\n", + " cell.notebook.kernel.execute(cmd_destroy);\n", + " }\n", + " }\n", + "\n", + " /**\n", + " * Handle when a new output is added\n", + " */\n", + " function handleAddOutput(event, handle) {\n", + " const output_area = handle.output_area;\n", + " const output = handle.output;\n", + "\n", + " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", + " if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n", + " return\n", + " }\n", + "\n", + " const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", + "\n", + " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", + " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", + " // store reference to embed id on output_area\n", + " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", + " }\n", + " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", + " const bk_div = document.createElement(\"div\");\n", + " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", + " const script_attrs = bk_div.children[0].attributes;\n", + " for (let i = 0; i < script_attrs.length; i++) {\n", + " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", + " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", + " }\n", + " // store reference to server id on output_area\n", + " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", + " }\n", + " }\n", + "\n", + " function register_renderer(events, OutputArea) {\n", + "\n", + " function append_mime(data, metadata, element) {\n", + " // create a DOM node to render to\n", + " const toinsert = this.create_output_subarea(\n", + " metadata,\n", + " CLASS_NAME,\n", + " EXEC_MIME_TYPE\n", + " );\n", + " 
this.keyboard_manager.register_events(toinsert);\n", + " // Render to node\n", + " const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", + " render(props, toinsert[toinsert.length - 1]);\n", + " element.append(toinsert);\n", + " return toinsert\n", + " }\n", + "\n", + " /* Handle when an output is cleared or removed */\n", + " events.on('clear_output.CodeCell', handleClearOutput);\n", + " events.on('delete.Cell', handleClearOutput);\n", + "\n", + " /* Handle when a new output is added */\n", + " events.on('output_added.OutputArea', handleAddOutput);\n", + "\n", + " /**\n", + " * Register the mime type and append_mime function with output_area\n", + " */\n", + " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", + " /* Is output safe? */\n", + " safe: true,\n", + " /* Index of renderer in `output_area.display_order` */\n", + " index: 0\n", + " });\n", + " }\n", + "\n", + " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", + " if (root.Jupyter !== undefined) {\n", + " const events = require('base/js/events');\n", + " const OutputArea = require('notebook/js/outputarea').OutputArea;\n", + "\n", + " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", + " register_renderer(events, OutputArea);\n", + " }\n", + " }\n", + " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", + " root._bokeh_timeout = Date.now() + 5000;\n", + " root._bokeh_failed_load = false;\n", + " }\n", + "\n", + " const NB_LOAD_WARNING = {'data': {'text/html':\n", + " \"
\\n\"+\n", + " \"

\\n\"+\n", + " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", + " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", + " \"

\\n\"+\n", + " \"\\n\"+\n", + " \"\\n\"+\n", + " \"from bokeh.resources import INLINE\\n\"+\n", + " \"output_notebook(resources=INLINE)\\n\"+\n", + " \"\\n\"+\n", + " \"
\"}};\n", + "\n", + " function display_loaded() {\n", + " const el = document.getElementById(\"fe0062d8-4ea1-42b8-bc8e-166e0e29d52f\");\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS is loading...\";\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", + " }\n", + " } else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(display_loaded, 100)\n", + " }\n", + " }\n", + "\n", + " function run_callbacks() {\n", + " try {\n", + " root._bokeh_onload_callbacks.forEach(function(callback) {\n", + " if (callback != null)\n", + " callback();\n", + " });\n", + " } finally {\n", + " delete root._bokeh_onload_callbacks\n", + " }\n", + " console.debug(\"Bokeh: all callbacks have finished\");\n", + " }\n", + "\n", + " function load_libs(css_urls, js_urls, callback) {\n", + " if (css_urls == null) css_urls = [];\n", + " if (js_urls == null) js_urls = [];\n", + "\n", + " root._bokeh_onload_callbacks.push(callback);\n", + " if (root._bokeh_is_loading > 0) {\n", + " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", + " return null;\n", + " }\n", + " if (js_urls == null || js_urls.length === 0) {\n", + " run_callbacks();\n", + " return null;\n", + " }\n", + " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", + " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", + "\n", + " function on_load() {\n", + " root._bokeh_is_loading--;\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", + " run_callbacks()\n", + " }\n", + " }\n", + "\n", + " function on_error(url) {\n", + " console.error(\"failed to load \" + url);\n", + " }\n", + "\n", + " for (let i = 0; i < css_urls.length; i++) {\n", + " const url = css_urls[i];\n", + " const element = document.createElement(\"link\");\n", + " element.onload = 
on_load;\n", + " element.onerror = on_error.bind(null, url);\n", + " element.rel = \"stylesheet\";\n", + " element.type = \"text/css\";\n", + " element.href = url;\n", + " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " for (let i = 0; i < js_urls.length; i++) {\n", + " const url = js_urls[i];\n", + " const element = document.createElement('script');\n", + " element.onload = on_load;\n", + " element.onerror = on_error.bind(null, url);\n", + " element.async = false;\n", + " element.src = url;\n", + " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", + " document.head.appendChild(element);\n", + " }\n", + " };\n", + "\n", + " function inject_raw_css(css) {\n", + " const element = document.createElement(\"style\");\n", + " element.appendChild(document.createTextNode(css));\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.2.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.2.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.2.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.2.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.2.1.min.js\"];\n", + " const css_urls = [];\n", + "\n", + " const inline_js = [ function(Bokeh) {\n", + " Bokeh.set_log_level(\"info\");\n", + " },\n", + "function(Bokeh) {\n", + " }\n", + " ];\n", + "\n", + " function run_inline_js() {\n", + " if (root.Bokeh !== undefined || force === true) {\n", + " for (let i = 0; i < inline_js.length; i++) {\n", + " inline_js[i].call(root, root.Bokeh);\n", + " }\n", + "if (force === true) {\n", + " display_loaded();\n", + " }} else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(run_inline_js, 100);\n", + " } else if (!root._bokeh_failed_load) {\n", + " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", + " 
root._bokeh_failed_load = true;\n", + " } else if (force !== true) {\n", + " const cell = $(document.getElementById(\"fe0062d8-4ea1-42b8-bc8e-166e0e29d52f\")).parents('.cell').data().cell;\n", + " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", + " }\n", + " }\n", + "\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", + " run_inline_js();\n", + " } else {\n", + " load_libs(css_urls, js_urls, function() {\n", + " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", + " run_inline_js();\n", + " });\n", + " }\n", + "}(window));" + ], + "application/vnd.bokehjs_load.v0+json": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(\"fe0062d8-4ea1-42b8-bc8e-166e0e29d52f\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = 
document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.2.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.2.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.2.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.2.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.2.1.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\nif (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(\"fe0062d8-4ea1-42b8-bc8e-166e0e29d52f\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as 
np\n", + "import pandas as pd\n", + "\n", + "import iqplot\n", + "\n", + "import bokeh.io\n", + "bokeh.io.output_notebook()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It often happens that experiments consist of multiple data files that need to be brought together into a single data frame to work with in exploratory data analysis and subsequent analyses. Through its concatenation and merging capabilities, Pandas provides powerful tools for handling this sort of data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The frog tongue strike data set\n", + "\n", + "As usual, we will work with a real data set to learn about concatenation and merging of data frames. The data set we will use comes from a fun paper about the adhesive properties of frog tongues. The reference is [Kleinteich and Gorb, Tongue adhesion in the horned frog *Ceratophrys sp.*, *Sci. Rep.*, **4**, 5225, 2014](https://dx.doi.org/10.1038%2Fsrep05225). You might also want to check out a *New York Times* feature on the paper [here](http://www.nytimes.com/2014/08/25/science/a-frog-thats-a-living-breathing-pac-man.html).\n", + "\n", + "In this paper, the authors investigated various properties of the adhesive characteristics of the tongues of horned frogs when they strike prey. The authors had a striking pad connected to a cantilever to measure forces. They also used high speed cameras to capture the strike and record relevant data.\n", + "\n", + "To get an idea of the experimental set up, you can check out this movie, kindly sent to me by Thomas Kleinteich. If video does not play in your browser, you may download it [here](kleinteich_frog_strike.mp4).\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The data files\n", + "\n", + "I pulled data files from the [Kleinteich and Gorb paper](https://dx.doi.org/10.1038%2Fsrep05225). You can download the data files here: [https://s3.amazonaws.com/bebi103.caltech.edu/data/frog_strikes.zip](https://s3.amazonaws.com/bebi103.caltech.edu/data/frog_strikes.zip).\n", + "\n", + "There are four files, one for each of the four frogs, labeled with IDs I, II, III, and IV, that were studied. To see the format of the files, we can look at the content of the file for frog I. You can use\n", + "\n", + " head -n 20 ../data/frog_strikes_I.csv\n", + " \n", + "from the command line. Here is the content of the first data file.\n", + "\n", + "```\n", + "# These data are from Kleinteich and Gorb, Sci. Rep., 4, 5225, 2014.\n", + "# Frog ID: I\n", + "# Age: adult\n", + "# Snout-vent-length (SVL): 63 mm\n", + "# Body weight: 63.1 g\n", + "# Species: Ceratophrys cranwelli crossed with Ceratophrys cornuta\n", + "date,trial number,impact force (mN),impact time (ms),impact force / body weight,adhesive force (mN),time frog pulls on target (ms),adhesive force / body weight,adhesive impulse (N-s),total contact area (mm2),contact area without mucus (mm2),contact area with mucus / contact area without mucus,contact pressure (Pa),adhesive strength (Pa)\n", + "2013_02_26,3,1205,46,1.95,-785,884,1.27,-0.290,387,70,0.82,3117,-2030\n", + "2013_02_26,4,2527,44,4.08,-983,248,1.59,-0.181,101,94,0.07,24923,-9695\n", + "2013_03_01,1,1745,34,2.82,-850,211,1.37,-0.157,83,79,0.05,21020,-10239\n", + "2013_03_01,2,1556,41,2.51,-455,1025,0.74,-0.170,330,158,0.52,4718,-1381\n", + "2013_03_01,3,493,36,0.80,-974,499,1.57,-0.423,245,216,0.12,2012,-3975\n", + "2013_03_01,4,2276,31,3.68,-592,969,0.96,-0.176,341,106,0.69,6676,-1737\n", + "2013_03_05,1,556,43,0.90,-512,835,0.83,-0.285,359,110,0.69,1550,-1427\n", + "2013_03_05,2,1928,46,3.11,-804,508,1.30,-0.285,246,178,0.28,7832,-3266\n", + 
"2013_03_05,3,2641,50,4.27,-690,491,1.12,-0.239,269,224,0.17,9824,-2568\n", + "2013_03_05,4,1897,41,3.06,-462,839,0.75,-0.328,266,176,0.34,7122,-1733\n", + "2013_03_12,1,1891,40,3.06,-766,1069,1.24,-0.380,408,33,0.92,4638,-1879\n", + "2013_03_12,2,1545,48,2.50,-715,649,1.15,-0.298,141,112,0.21,10947,-5064\n", + "2013_03_12,3,1307,29,2.11,-613,1845,0.99,-0.768,455,92,0.80,2874,-1348\n", + "2013_03_12,4,1692,31,2.73,-677,917,1.09,-0.457,186,129,0.31,9089,-3636\n", + "2013_03_12,5,1543,38,2.49,-528,750,0.85,-0.353,153,148,0.03,10095,-3453\n", + "2013_03_15,1,1282,31,2.07,-452,785,0.73,-0.253,290,105,0.64,4419,-1557\n", + "2013_03_15,2,775,34,1.25,-430,837,0.70,-0.276,257,124,0.52,3019,-1677\n", + "2013_03_15,3,2032,60,3.28,-652,486,1.05,-0.257,147,134,0.09,13784,-4425\n", + "2013_03_15,4,1240,34,2.00,-692,906,1.12,-0.317,364,260,0.28,3406,-1901\n", + "2013_03_15,5,473,40,0.76,-536,1218,0.87,-0.382,259,168,0.35,1830,-2073\n", + "```\n", + "\n", + "The first lines all begin with `#` signs, signifying that they are comments. They do give important information about the frog, though.\n", + "\n", + "The first line after the comments are the headers, giving the column names for the data frame we will load." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Concatenating data frames\n", + "\n", + "We would like to have all of the data frames be together in one data frame so we can conveniently do things like make plots comparing the four frogs. Let's read in the data sets and make a list of data frames." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datetrial numberimpact force (mN)impact time (ms)impact force / body weightadhesive force (mN)time frog pulls on target (ms)adhesive force / body weightadhesive impulse (N-s)total contact area (mm2)contact area without mucus (mm2)contact area with mucus / contact area without mucuscontact pressure (Pa)adhesive strength (Pa)
02013_02_2631205461.95-7858841.27-0.290387700.823117-2030
12013_02_2642527444.08-9832481.59-0.181101940.0724923-9695
22013_03_0111745342.82-8502111.37-0.15783790.0521020-10239
32013_03_0121556412.51-45510250.74-0.1703301580.524718-1381
42013_03_013493360.80-9744991.57-0.4232452160.122012-3975
\n", + "
" + ], + "text/plain": [ + " date trial number impact force (mN) impact time (ms) \\\n", + "0 2013_02_26 3 1205 46 \n", + "1 2013_02_26 4 2527 44 \n", + "2 2013_03_01 1 1745 34 \n", + "3 2013_03_01 2 1556 41 \n", + "4 2013_03_01 3 493 36 \n", + "\n", + " impact force / body weight adhesive force (mN) \\\n", + "0 1.95 -785 \n", + "1 4.08 -983 \n", + "2 2.82 -850 \n", + "3 2.51 -455 \n", + "4 0.80 -974 \n", + "\n", + " time frog pulls on target (ms) adhesive force / body weight \\\n", + "0 884 1.27 \n", + "1 248 1.59 \n", + "2 211 1.37 \n", + "3 1025 0.74 \n", + "4 499 1.57 \n", + "\n", + " adhesive impulse (N-s) total contact area (mm2) \\\n", + "0 -0.290 387 \n", + "1 -0.181 101 \n", + "2 -0.157 83 \n", + "3 -0.170 330 \n", + "4 -0.423 245 \n", + "\n", + " contact area without mucus (mm2) \\\n", + "0 70 \n", + "1 94 \n", + "2 79 \n", + "3 158 \n", + "4 216 \n", + "\n", + " contact area with mucus / contact area without mucus \\\n", + "0 0.82 \n", + "1 0.07 \n", + "2 0.05 \n", + "3 0.52 \n", + "4 0.12 \n", + "\n", + " contact pressure (Pa) adhesive strength (Pa) \n", + "0 3117 -2030 \n", + "1 24923 -9695 \n", + "2 21020 -10239 \n", + "3 4718 -1381 \n", + "4 2012 -3975 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# On a local machine, we would do this: fnames = glob.glob('../data/frog_strikes_*.csv')\n", + "# But for Colab compatibility, we will do it by hand\n", + "fnames = [\n", + " os.path.join(data_path, f\"frog_strikes_{frog_id}.csv\")\n", + " for frog_id in [\"I\", \"II\", \"III\", \"IV\"]\n", + "]\n", + "\n", + "dfs = [pd.read_csv(f, comment=\"#\") for f in fnames]\n", + "\n", + "# Take a look at first data frame\n", + "dfs[0].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have successfully loaded in all of the data frames. 
They all have the same columns (as given by the CSV files) and they all have the same indexes (range indexes that were applied by default when loading from the CSV files). We do not really care about the indexes. So, we wish to tape the data frames together vertically. We can use the `pd.concat()` function to do this.\n", + "\n", + "Before we do that, though, we might notice a problem. We will not have information to tell us which frog is which. We might therefore like to add a column to each data frame that has the frog ID, and then concatenate them. We can parse the ID of the frog from the file name, as we can see by looking at the file names." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['../data/frog_strikes_I.csv',\n", + " '../data/frog_strikes_II.csv',\n", + " '../data/frog_strikes_III.csv',\n", + " '../data/frog_strikes_IV.csv']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fnames" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So, for each data frame/file name pair, we extract the Roman numeral and add a column to the data frame containing the frog ID." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datetrial numberimpact force (mN)impact time (ms)impact force / body weightadhesive force (mN)time frog pulls on target (ms)adhesive force / body weightadhesive impulse (N-s)total contact area (mm2)contact area without mucus (mm2)contact area with mucus / contact area without mucuscontact pressure (Pa)adhesive strength (Pa)ID
02013_02_2631205461.95-7858841.27-0.290387700.823117-2030I
12013_02_2642527444.08-9832481.59-0.181101940.0724923-9695I
22013_03_0111745342.82-8502111.37-0.15783790.0521020-10239I
32013_03_0121556412.51-45510250.74-0.1703301580.524718-1381I
42013_03_013493360.80-9744991.57-0.4232452160.122012-3975I
\n", + "
" + ], + "text/plain": [ + " date trial number impact force (mN) impact time (ms) \\\n", + "0 2013_02_26 3 1205 46 \n", + "1 2013_02_26 4 2527 44 \n", + "2 2013_03_01 1 1745 34 \n", + "3 2013_03_01 2 1556 41 \n", + "4 2013_03_01 3 493 36 \n", + "\n", + " impact force / body weight adhesive force (mN) \\\n", + "0 1.95 -785 \n", + "1 4.08 -983 \n", + "2 2.82 -850 \n", + "3 2.51 -455 \n", + "4 0.80 -974 \n", + "\n", + " time frog pulls on target (ms) adhesive force / body weight \\\n", + "0 884 1.27 \n", + "1 248 1.59 \n", + "2 211 1.37 \n", + "3 1025 0.74 \n", + "4 499 1.57 \n", + "\n", + " adhesive impulse (N-s) total contact area (mm2) \\\n", + "0 -0.290 387 \n", + "1 -0.181 101 \n", + "2 -0.157 83 \n", + "3 -0.170 330 \n", + "4 -0.423 245 \n", + "\n", + " contact area without mucus (mm2) \\\n", + "0 70 \n", + "1 94 \n", + "2 79 \n", + "3 158 \n", + "4 216 \n", + "\n", + " contact area with mucus / contact area without mucus \\\n", + "0 0.82 \n", + "1 0.07 \n", + "2 0.05 \n", + "3 0.52 \n", + "4 0.12 \n", + "\n", + " contact pressure (Pa) adhesive strength (Pa) ID \n", + "0 3117 -2030 I \n", + "1 24923 -9695 I \n", + "2 21020 -10239 I \n", + "3 4718 -1381 I \n", + "4 2012 -3975 I " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for i, f in enumerate(fnames):\n", + " frog_id = f[f.rfind('_')+1:f.rfind('.')]\n", + " dfs[i]['ID'] = frog_id\n", + " \n", + "# Take a look\n", + "dfs[0].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Good! Now all data frames have an `'ID'` column, and we can concatenate. The `pd.concat()` function takes as input a list of data frames to be concatenated. Since we do not care about the index, we can use the `ignore_index=True` kwarg." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of rows: 80 \n", + "Unique IDs: ['I' 'II' 'III' 'IV']\n" + ] + } + ], + "source": [ + "df = pd.concat(dfs, ignore_index=True)\n", + "\n", + "# Make sure we got them all\n", + "print('Number of rows:', len(df), '\\nUnique IDs:', df['ID'].unique())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### More advanced concatenation\n", + "\n", + "When we concatenated, we updated each data frame with a fresh column. The `pd.concat()` function can handle some of this for you. If we instead pass a dictionary of data frames rather than a list, it applies the keys to each data frame being concatenated, using a multiindex. First, we'll read in the data frames as a dictionary of data frames instead of a list." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['I', 'II', 'III', 'IV'])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Make dictionary of data frames\n", + "dfs = {\n", + " f[f.rfind(\"_\") + 1 : f.rfind(\".\")]: pd.read_csv(f, comment=\"#\")\n", + " for i, f in enumerate(fnames)\n", + "}\n", + "\n", + "# Verify that keys are in fact IDs\n", + "dfs.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, if we call `pd.concat()` with dictionary input, we get a new data frame with a multiindex." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datetrial numberimpact force (mN)impact time (ms)impact force / body weightadhesive force (mN)time frog pulls on target (ms)adhesive force / body weightadhesive impulse (N-s)total contact area (mm2)contact area without mucus (mm2)contact area with mucus / contact area without mucuscontact pressure (Pa)adhesive strength (Pa)
I02013_02_2631205461.95-7858841.27-0.290387700.823117-2030
12013_02_2642527444.08-9832481.59-0.181101940.0724923-9695
22013_03_0111745342.82-8502111.37-0.15783790.0521020-10239
32013_03_0121556412.51-45510250.74-0.1703301580.524718-1381
42013_03_013493360.80-9744991.57-0.4232452160.122012-3975
\n", + "
" + ], + "text/plain": [ + " date trial number impact force (mN) impact time (ms) \\\n", + "I 0 2013_02_26 3 1205 46 \n", + " 1 2013_02_26 4 2527 44 \n", + " 2 2013_03_01 1 1745 34 \n", + " 3 2013_03_01 2 1556 41 \n", + " 4 2013_03_01 3 493 36 \n", + "\n", + " impact force / body weight adhesive force (mN) \\\n", + "I 0 1.95 -785 \n", + " 1 4.08 -983 \n", + " 2 2.82 -850 \n", + " 3 2.51 -455 \n", + " 4 0.80 -974 \n", + "\n", + " time frog pulls on target (ms) adhesive force / body weight \\\n", + "I 0 884 1.27 \n", + " 1 248 1.59 \n", + " 2 211 1.37 \n", + " 3 1025 0.74 \n", + " 4 499 1.57 \n", + "\n", + " adhesive impulse (N-s) total contact area (mm2) \\\n", + "I 0 -0.290 387 \n", + " 1 -0.181 101 \n", + " 2 -0.157 83 \n", + " 3 -0.170 330 \n", + " 4 -0.423 245 \n", + "\n", + " contact area without mucus (mm2) \\\n", + "I 0 70 \n", + " 1 94 \n", + " 2 79 \n", + " 3 158 \n", + " 4 216 \n", + "\n", + " contact area with mucus / contact area without mucus \\\n", + "I 0 0.82 \n", + " 1 0.07 \n", + " 2 0.05 \n", + " 3 0.52 \n", + " 4 0.12 \n", + "\n", + " contact pressure (Pa) adhesive strength (Pa) \n", + "I 0 3117 -2030 \n", + " 1 24923 -9695 \n", + " 2 21020 -10239 \n", + " 3 4718 -1381 \n", + " 4 2012 -3975 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.concat(dfs)\n", + "\n", + "# Take a look\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have a multiindex for the rows, with the high level index being the ID and the low level index being the original index of the data frame that was concatenated. It is useful to give these indexes names so we can conveniently refer to them. We can do that by setting the `df.index.names` property as\n", + "\n", + "```python\n", + "df.index.names = ['ID', 'original index']\n", + "```\n", + "\n", + "We can instead specify a `names` kwarg when we call `pd.concat()`. 
This kwarg specifies the names of the resulting multiindex from the concatenation." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datetrial numberimpact force (mN)impact time (ms)impact force / body weightadhesive force (mN)time frog pulls on target (ms)adhesive force / body weightadhesive impulse (N-s)total contact area (mm2)contact area without mucus (mm2)contact area with mucus / contact area without mucuscontact pressure (Pa)adhesive strength (Pa)
IDoriginal index
I02013_02_2631205461.95-7858841.27-0.290387700.823117-2030
12013_02_2642527444.08-9832481.59-0.181101940.0724923-9695
22013_03_0111745342.82-8502111.37-0.15783790.0521020-10239
32013_03_0121556412.51-45510250.74-0.1703301580.524718-1381
42013_03_013493360.80-9744991.57-0.4232452160.122012-3975
\n", + "
" + ], + "text/plain": [ + " date trial number impact force (mN) \\\n", + "ID original index \n", + "I 0 2013_02_26 3 1205 \n", + " 1 2013_02_26 4 2527 \n", + " 2 2013_03_01 1 1745 \n", + " 3 2013_03_01 2 1556 \n", + " 4 2013_03_01 3 493 \n", + "\n", + " impact time (ms) impact force / body weight \\\n", + "ID original index \n", + "I 0 46 1.95 \n", + " 1 44 4.08 \n", + " 2 34 2.82 \n", + " 3 41 2.51 \n", + " 4 36 0.80 \n", + "\n", + " adhesive force (mN) time frog pulls on target (ms) \\\n", + "ID original index \n", + "I 0 -785 884 \n", + " 1 -983 248 \n", + " 2 -850 211 \n", + " 3 -455 1025 \n", + " 4 -974 499 \n", + "\n", + " adhesive force / body weight adhesive impulse (N-s) \\\n", + "ID original index \n", + "I 0 1.27 -0.290 \n", + " 1 1.59 -0.181 \n", + " 2 1.37 -0.157 \n", + " 3 0.74 -0.170 \n", + " 4 1.57 -0.423 \n", + "\n", + " total contact area (mm2) contact area without mucus (mm2) \\\n", + "ID original index \n", + "I 0 387 70 \n", + " 1 101 94 \n", + " 2 83 79 \n", + " 3 330 158 \n", + " 4 245 216 \n", + "\n", + " contact area with mucus / contact area without mucus \\\n", + "ID original index \n", + "I 0 0.82 \n", + " 1 0.07 \n", + " 2 0.05 \n", + " 3 0.52 \n", + " 4 0.12 \n", + "\n", + " contact pressure (Pa) adhesive strength (Pa) \n", + "ID original index \n", + "I 0 3117 -2030 \n", + " 1 24923 -9695 \n", + " 2 21020 -10239 \n", + " 3 4718 -1381 \n", + " 4 2012 -3975 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.concat(dfs, names=['ID', 'original index'])\n", + "\n", + "# Take a look\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We conveniently have labeled indexes, and we can now make `ID` a column in the data frame using the `reset_index()` method." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDoriginal indexdatetrial numberimpact force (mN)impact time (ms)impact force / body weightadhesive force (mN)time frog pulls on target (ms)adhesive force / body weightadhesive impulse (N-s)total contact area (mm2)contact area without mucus (mm2)contact area with mucus / contact area without mucuscontact pressure (Pa)adhesive strength (Pa)
0I02013_02_2631205461.95-7858841.27-0.290387700.823117-2030
1I12013_02_2642527444.08-9832481.59-0.181101940.0724923-9695
2I22013_03_0111745342.82-8502111.37-0.15783790.0521020-10239
3I32013_03_0121556412.51-45510250.74-0.1703301580.524718-1381
4I42013_03_013493360.80-9744991.57-0.4232452160.122012-3975
\n", + "
" + ], + "text/plain": [ + " ID original index date trial number impact force (mN) \\\n", + "0 I 0 2013_02_26 3 1205 \n", + "1 I 1 2013_02_26 4 2527 \n", + "2 I 2 2013_03_01 1 1745 \n", + "3 I 3 2013_03_01 2 1556 \n", + "4 I 4 2013_03_01 3 493 \n", + "\n", + " impact time (ms) impact force / body weight adhesive force (mN) \\\n", + "0 46 1.95 -785 \n", + "1 44 4.08 -983 \n", + "2 34 2.82 -850 \n", + "3 41 2.51 -455 \n", + "4 36 0.80 -974 \n", + "\n", + " time frog pulls on target (ms) adhesive force / body weight \\\n", + "0 884 1.27 \n", + "1 248 1.59 \n", + "2 211 1.37 \n", + "3 1025 0.74 \n", + "4 499 1.57 \n", + "\n", + " adhesive impulse (N-s) total contact area (mm2) \\\n", + "0 -0.290 387 \n", + "1 -0.181 101 \n", + "2 -0.157 83 \n", + "3 -0.170 330 \n", + "4 -0.423 245 \n", + "\n", + " contact area without mucus (mm2) \\\n", + "0 70 \n", + "1 94 \n", + "2 79 \n", + "3 158 \n", + "4 216 \n", + "\n", + " contact area with mucus / contact area without mucus \\\n", + "0 0.82 \n", + "1 0.07 \n", + "2 0.05 \n", + "3 0.52 \n", + "4 0.12 \n", + "\n", + " contact pressure (Pa) adhesive strength (Pa) \n", + "0 3117 -2030 \n", + "1 24923 -9695 \n", + "2 21020 -10239 \n", + "3 4718 -1381 \n", + "4 2012 -3975 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.reset_index()\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now have a default range index for the data frame that we do not care about. Because the original index was not informative either, we can delete that column if we like, but it is not really a burden to have an unused column laying around in a data set this small. Nonetheless, let's blow it away." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDdatetrial numberimpact force (mN)impact time (ms)impact force / body weightadhesive force (mN)time frog pulls on target (ms)adhesive force / body weightadhesive impulse (N-s)total contact area (mm2)contact area without mucus (mm2)contact area with mucus / contact area without mucuscontact pressure (Pa)adhesive strength (Pa)
0I2013_02_2631205461.95-7858841.27-0.290387700.823117-2030
1I2013_02_2642527444.08-9832481.59-0.181101940.0724923-9695
2I2013_03_0111745342.82-8502111.37-0.15783790.0521020-10239
3I2013_03_0121556412.51-45510250.74-0.1703301580.524718-1381
4I2013_03_013493360.80-9744991.57-0.4232452160.122012-3975
\n", + "
" + ], + "text/plain": [ + " ID date trial number impact force (mN) impact time (ms) \\\n", + "0 I 2013_02_26 3 1205 46 \n", + "1 I 2013_02_26 4 2527 44 \n", + "2 I 2013_03_01 1 1745 34 \n", + "3 I 2013_03_01 2 1556 41 \n", + "4 I 2013_03_01 3 493 36 \n", + "\n", + " impact force / body weight adhesive force (mN) \\\n", + "0 1.95 -785 \n", + "1 4.08 -983 \n", + "2 2.82 -850 \n", + "3 2.51 -455 \n", + "4 0.80 -974 \n", + "\n", + " time frog pulls on target (ms) adhesive force / body weight \\\n", + "0 884 1.27 \n", + "1 248 1.59 \n", + "2 211 1.37 \n", + "3 1025 0.74 \n", + "4 499 1.57 \n", + "\n", + " adhesive impulse (N-s) total contact area (mm2) \\\n", + "0 -0.290 387 \n", + "1 -0.181 101 \n", + "2 -0.157 83 \n", + "3 -0.170 330 \n", + "4 -0.423 245 \n", + "\n", + " contact area without mucus (mm2) \\\n", + "0 70 \n", + "1 94 \n", + "2 79 \n", + "3 158 \n", + "4 216 \n", + "\n", + " contact area with mucus / contact area without mucus \\\n", + "0 0.82 \n", + "1 0.07 \n", + "2 0.05 \n", + "3 0.52 \n", + "4 0.12 \n", + "\n", + " contact pressure (Pa) adhesive strength (Pa) \n", + "0 3117 -2030 \n", + "1 24923 -9695 \n", + "2 21020 -10239 \n", + "3 4718 -1381 \n", + "4 2012 -3975 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del df['original index']\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now have a nice, tidy data frame!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating a DataFrame from scratch\n", + "\n", + "Looking back at the [headers of the original data files](#The-data-files), we see that there is information present in the header that we would like to have in our data frame. For example, it would be nice to know if each strike came from an adult or juvenile. Or what the snout-vent length was. 
Working toward the goal of including this in our data frame, we will first construct a new data frame containing information about each frog." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data frames from dictionaries\n", + "\n", + "One way to create this new data frame is to first construct a dictionary with the respective fields. Since these data sets are small, we can look at the files and make the dictionary by hand." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "data_dict = {\n", + " \"ID\": [\"I\", \"II\", \"III\", \"IV\"],\n", + " \"age\": [\"adult\", \"adult\", \"juvenile\", \"juvenile\"],\n", + " \"SVL (mm)\": [63, 70, 28, 31],\n", + " \"body weight (g)\": [63.1, 72.7, 12.7, 12.7],\n", + " \"species\": [\"cross\", \"cross\", \"cranwelli\", \"cranwelli\"],\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have this dictionary, we can convert it into a `DataFrame` by instantiating the `pd.DataFrame` class with it, using the `data` kwarg." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDageSVL (mm)body weight (g)species
0Iadult6363.1cross
1IIadult7072.7cross
2IIIjuvenile2812.7cranwelli
3IVjuvenile3112.7cranwelli
\n", + "
" + ], + "text/plain": [ + " ID age SVL (mm) body weight (g) species\n", + "0 I adult 63 63.1 cross\n", + "1 II adult 70 72.7 cross\n", + "2 III juvenile 28 12.7 cranwelli\n", + "3 IV juvenile 31 12.7 cranwelli" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Make it into a DataFrame\n", + "df_frog_info = pd.DataFrame(data=data_dict)\n", + "\n", + "# Take a look\n", + "df_frog_info" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nice!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data frames from numpy arrays\n", + "\n", + "Sometimes the data sets are not small enough to construct a dictionary by hand. Oftentimes, we have a two-dimensional array of data that we want to make into a `DataFrame`. As an example, let's say we have a Numpy array where the first column is snout vent length and the second is weight." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[63. , 63.1],\n", + " [70. , 72.7],\n", + " [28. , 12.7],\n", + " [31. , 12.7]])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = np.array([[63, 70, 28, 31], [63.1, 72.7, 12.7, 12.7]]).transpose()\n", + "\n", + "# Verify that it's what we think it is\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To make this into a `DataFrame`, we again create `pd.DataFrame` instance, but this time we also specify the `columns` keyword argument." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SVL (mm)weight (g)
063.063.1
170.072.7
228.012.7
331.012.7
\n", + "
" + ], + "text/plain": [ + " SVL (mm) weight (g)\n", + "0 63.0 63.1\n", + "1 70.0 72.7\n", + "2 28.0 12.7\n", + "3 31.0 12.7" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_demo = pd.DataFrame(data=data, columns=[\"SVL (mm)\", \"weight (g)\"])\n", + "\n", + "# Take a look\n", + "df_demo" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That also works. Generally, any two-dimensional Numpy array can be converted into a `DataFrame` in this way. You just need to supply column names." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Programmatically creating a data frame\n", + "\n", + "Hand-entering data should be minimized. The information about each frog were hand-entered once by the experimenter. We should not hand-enter them again. We therefore should parse the comment lines of input files to get the pertinent information.\n", + "\n", + "Note, though, that in the case of a single experiment with only four data sets, hand entering might be faster and indeed less error prone than doing it programmatically. We should definitely do it programmatically if we have a large number of data files or will ever do an experiment with the same file format again.\n", + "\n", + "So, let's programmatically parse the files. We start by writing a function to parse the metadata from a single file. Recall that the comment lines look like this:\n", + "\n", + "```\n", + "# These data are from Kleinteich and Gorb, Sci. 
Rep., 4, 5225, 2014.\n", + "# Frog ID: I\n", + "# Age: adult\n", + "# Snout-vent-length (SVL): 63 mm\n", + "# Body weight: 63.1 g\n", + "# Species: Ceratophrys cranwelli crossed with Ceratophrys cornuta\n", + "```\n", + "\n", + "(The function below will not work with Colab because `open()` does not work for files specified by a URL.)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_frog_metadata(fname):\n", + " with open(fname, 'r') as f:\n", + " # Citation line, ignore.\n", + " f.readline()\n", + " \n", + " # Frog ID\n", + " line = f.readline()\n", + " frog_id = line[line.find(':')+1:].strip()\n", + " \n", + " # Age\n", + " line = f.readline()\n", + " age = line[line.find(':')+1:].strip()\n", + " \n", + " # SVL, assume units given as mm\n", + " line = f.readline()\n", + " svl = line[line.find(':')+1:line.rfind(' ')].strip()\n", + " \n", + " # Body weight, assume units given as g\n", + " line = f.readline()\n", + " body_weight = line[line.find(':')+1:line.rfind(' ')].strip()\n", + "\n", + " # Species (either cranwelli or cross)\n", + " line = f.readline()\n", + " species = line[line.find(':')+1:].strip()\n", + " if 'cross' in species:\n", + " species = 'cross'\n", + " else:\n", + " species = 'cranwelli'\n", + "\n", + " return frog_id, age, svl, body_weight, species" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take it for a spin." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('I', 'adult', '63', '63.1', 'cross')" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "parse_frog_metadata(os.path.join(data_path, 'frog_strikes_I.csv'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks good! Now we can create a list of tuples to use as data for making a data frame." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('I', 'adult', '63', '63.1', 'cross'),\n", + " ('II', 'adult', '70', '72.7', 'cross'),\n", + " ('III', 'juvenile', '28', '12.7', 'cranwelli'),\n", + " ('IV', 'juvenile', '31', '12.7', 'cranwelli')]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = [parse_frog_metadata(f) for f in fnames]\n", + " \n", + "# Take a look\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now input this list of tuples, plus the column names, into `pd.DataFrame()`, and we've got our data frame." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDageSVL (mm)body weight (g)species
0Iadult6363.1cross
1IIadult7072.7cross
2IIIjuvenile2812.7cranwelli
3IVjuvenile3112.7cranwelli
\n", + "
" + ], + "text/plain": [ + " ID age SVL (mm) body weight (g) species\n", + "0 I adult 63 63.1 cross\n", + "1 II adult 70 72.7 cross\n", + "2 III juvenile 28 12.7 cranwelli\n", + "3 IV juvenile 31 12.7 cranwelli" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_frog_info = pd.DataFrame(\n", + " data=data, \n", + " columns=[\"ID\", \"age\", \"SVL (mm)\", \"body weight (g)\", \"species\"]\n", + ")\n", + "\n", + "# Take a look\n", + "df_frog_info" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Merging DataFrames\n", + "\n", + "Our ultimate goal is to add the information about the frogs into our main data frame, `df`, that we have been working with. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Brute force merge\n", + "\n", + "We can do it using tools we have already learned. For each row in the `DataFrame`, we can add the relevant value in each column. Because this will not be the final way I recommend doing this, I will do these operations on a copy of `df` using the `copy()` method." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDdatetrial numberimpact force (mN)impact time (ms)impact force / body weightadhesive force (mN)time frog pulls on target (ms)adhesive force / body weightadhesive impulse (N-s)total contact area (mm2)contact area without mucus (mm2)contact area with mucus / contact area without mucuscontact pressure (Pa)adhesive strength (Pa)ageSVL (mm)body weight (g)species
0I2013_02_2631205461.95-7858841.27-0.290387700.823117-2030adult6363.1cross
1I2013_02_2642527444.08-9832481.59-0.181101940.0724923-9695adult6363.1cross
2I2013_03_0111745342.82-8502111.37-0.15783790.0521020-10239adult6363.1cross
3I2013_03_0121556412.51-45510250.74-0.1703301580.524718-1381adult6363.1cross
4I2013_03_013493360.80-9744991.57-0.4232452160.122012-3975adult6363.1cross
\n", + "
" + ], + "text/plain": [ + " ID date trial number impact force (mN) impact time (ms) \\\n", + "0 I 2013_02_26 3 1205 46 \n", + "1 I 2013_02_26 4 2527 44 \n", + "2 I 2013_03_01 1 1745 34 \n", + "3 I 2013_03_01 2 1556 41 \n", + "4 I 2013_03_01 3 493 36 \n", + "\n", + " impact force / body weight adhesive force (mN) \\\n", + "0 1.95 -785 \n", + "1 4.08 -983 \n", + "2 2.82 -850 \n", + "3 2.51 -455 \n", + "4 0.80 -974 \n", + "\n", + " time frog pulls on target (ms) adhesive force / body weight \\\n", + "0 884 1.27 \n", + "1 248 1.59 \n", + "2 211 1.37 \n", + "3 1025 0.74 \n", + "4 499 1.57 \n", + "\n", + " adhesive impulse (N-s) total contact area (mm2) \\\n", + "0 -0.290 387 \n", + "1 -0.181 101 \n", + "2 -0.157 83 \n", + "3 -0.170 330 \n", + "4 -0.423 245 \n", + "\n", + " contact area without mucus (mm2) \\\n", + "0 70 \n", + "1 94 \n", + "2 79 \n", + "3 158 \n", + "4 216 \n", + "\n", + " contact area with mucus / contact area without mucus \\\n", + "0 0.82 \n", + "1 0.07 \n", + "2 0.05 \n", + "3 0.52 \n", + "4 0.12 \n", + "\n", + " contact pressure (Pa) adhesive strength (Pa) age SVL (mm) \\\n", + "0 3117 -2030 adult 63 \n", + "1 24923 -9695 adult 63 \n", + "2 21020 -10239 adult 63 \n", + "3 4718 -1381 adult 63 \n", + "4 2012 -3975 adult 63 \n", + "\n", + " body weight (g) species \n", + "0 63.1 cross \n", + "1 63.1 cross \n", + "2 63.1 cross \n", + "3 63.1 cross \n", + "4 63.1 cross " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Make a copy of df\n", + "df_copy = df.copy()\n", + "\n", + "# Build each column\n", + "for col in df_frog_info.columns[df_frog_info.columns != 'ID']:\n", + " # Make a new column with empty values\n", + " df_copy[col] = np.empty(len(df_copy))\n", + " \n", + " # Add in each entry, row by row\n", + " for i, r in df_copy.iterrows():\n", + " ind = df_frog_info['ID'] == r['ID']\n", + " df_copy.loc[i, col] = df_frog_info.loc[ind, col].iloc[0]\n", + " \n", + "# Take a look at the 
updated DataFrame\n", + "df_copy.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that I used the `iterrows()` method of the `df_copy` data frame. This iterator gives an index (which I called `i`) and a row of a data frame (which I called `r`). This method, and the analogous one for iterating over columns, `items()`, can be useful.\n", + "\n", + "But this approach seems rather clunky. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using pd.merge()\n", + "\n", + "A much better way to do it is to use Pandas's [built-in merge() method](https://pandas.pydata.org/pandas-docs/stable/merging.html#database-style-dataframe-joining-merging). Called with all the default keyword arguments, this function finds common columns between two `DataFrame`s (in this case, there is just one, the `ID` column), and then uses those columns to merge them, filling in values that match in the common columns. This is exactly what we want." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDdatetrial numberimpact force (mN)impact time (ms)impact force / body weightadhesive force (mN)time frog pulls on target (ms)adhesive force / body weightadhesive impulse (N-s)total contact area (mm2)contact area without mucus (mm2)contact area with mucus / contact area without mucuscontact pressure (Pa)adhesive strength (Pa)ageSVL (mm)body weight (g)species
0I2013_02_2631205461.95-7858841.27-0.290387700.823117-2030adult6363.1cross
1I2013_02_2642527444.08-9832481.59-0.181101940.0724923-9695adult6363.1cross
2I2013_03_0111745342.82-8502111.37-0.15783790.0521020-10239adult6363.1cross
3I2013_03_0121556412.51-45510250.74-0.1703301580.524718-1381adult6363.1cross
4I2013_03_013493360.80-9744991.57-0.4232452160.122012-3975adult6363.1cross
\n", + "
" + ], + "text/plain": [ + " ID date trial number impact force (mN) impact time (ms) \\\n", + "0 I 2013_02_26 3 1205 46 \n", + "1 I 2013_02_26 4 2527 44 \n", + "2 I 2013_03_01 1 1745 34 \n", + "3 I 2013_03_01 2 1556 41 \n", + "4 I 2013_03_01 3 493 36 \n", + "\n", + " impact force / body weight adhesive force (mN) \\\n", + "0 1.95 -785 \n", + "1 4.08 -983 \n", + "2 2.82 -850 \n", + "3 2.51 -455 \n", + "4 0.80 -974 \n", + "\n", + " time frog pulls on target (ms) adhesive force / body weight \\\n", + "0 884 1.27 \n", + "1 248 1.59 \n", + "2 211 1.37 \n", + "3 1025 0.74 \n", + "4 499 1.57 \n", + "\n", + " adhesive impulse (N-s) total contact area (mm2) \\\n", + "0 -0.290 387 \n", + "1 -0.181 101 \n", + "2 -0.157 83 \n", + "3 -0.170 330 \n", + "4 -0.423 245 \n", + "\n", + " contact area without mucus (mm2) \\\n", + "0 70 \n", + "1 94 \n", + "2 79 \n", + "3 158 \n", + "4 216 \n", + "\n", + " contact area with mucus / contact area without mucus \\\n", + "0 0.82 \n", + "1 0.07 \n", + "2 0.05 \n", + "3 0.52 \n", + "4 0.12 \n", + "\n", + " contact pressure (Pa) adhesive strength (Pa) age SVL (mm) \\\n", + "0 3117 -2030 adult 63 \n", + "1 24923 -9695 adult 63 \n", + "2 21020 -10239 adult 63 \n", + "3 4718 -1381 adult 63 \n", + "4 2012 -3975 adult 63 \n", + "\n", + " body weight (g) species \n", + "0 63.1 cross \n", + "1 63.1 cross \n", + "2 63.1 cross \n", + "3 63.1 cross \n", + "4 63.1 cross " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.merge(df_frog_info)\n", + "\n", + "# Check it out!\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the entries for the added columns were repeated appropriately, e.g., the body weight column had 63.1 for every row corresponding to frog I. \n", + "\n", + "I think this example of merging `DataFrame`s highlights the power of using them in your data analysis. 
Note also that there are plenty of options for how merges are done, and you should consult the [Pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#database-style-dataframe-or-named-series-joining-merging).\n", + "\n", + "This example also brings up an important point. When you have to perform operations on data frames, you can often \"brute force\" it with loops, etc. But if what you are trying to do seems like something a data analyst would frequently encounter, there is a good chance it's already built into Pandas, and you should ask Google how to do it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## At long last, a plot!\n", + "\n", + "While the purpose of this part of the lesson was to learn how to concatenate and merge data frames, going through all of that wrangling effort would somehow be unsatisfying if we didn't generate a plot. Let's compare the impact force on a per-mass basis for each frog." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " const docs_json = {\"d5db22b2-971e-4bb2-b214-ca6c42cee39b\":{\"version\":\"3.2.1\",\"title\":\"Bokeh Application\",\"roots\":[{\"type\":\"object\",\"name\":\"Figure\",\"id\":\"p1002\",\"attributes\":{\"x_range\":{\"type\":\"object\",\"name\":\"DataRange1d\",\"id\":\"p1004\"},\"y_range\":{\"type\":\"object\",\"name\":\"FactorRange\",\"id\":\"p1001\",\"attributes\":{\"factors\":[\"IV\",\"III\",\"II\",\"I\"]}},\"x_scale\":{\"type\":\"object\",\"name\":\"LinearScale\",\"id\":\"p1011\"},\"y_scale\":{\"type\":\"object\",\"name\":\"CategoricalScale\",\"id\":\"p1012\"},\"title\":{\"type\":\"object\",\"name\":\"Title\",\"id\":\"p1009\"},\"renderers\":[{\"type\":\"object\",\"name\":\"GlyphRenderer\",\"id\":\"p1040\",\"attributes\":{\"name\":\"hover_glyphs\",\"data_source\":{\"type\":\"object\",\"name\":\"ColumnDataSource\",\"id\":\"p1031\",\"attributes\":{\"selected\":{\"type\":\"object\",\"name\":\"Selection\",\"id\":\"p1032\",\"attributes\":{\"indices\":[],\"line_indices\":[]}},\"selection_policy\":{\"type\":\"object\",\"name\":\"UnionRenderers\",\"id\":\"p1033\"},\"data\":{\"type\":\"map\",\"entries\":[[\"index\",{\"type\":\"ndarray\",\"array\":{\"type\":\"bytes\",\"data\":\"AAAAAAEAAAACAAAAAwAAAAQAAAAFAAAABgAAAAcAAAAIAAAACQAAAAoAAAALAAAADAAAAA0AAAAOAAAADwAAABAAAAARAAAAEgAAABMAAAAUAAAAFQAAABYAAAAXAAAAGAAAABkAAAAaAAAAGwAAABwAAAAdAAAAHgAAAB8AAAAgAAAAIQAAACIAAAAjAAAAJAAAACUAAAAmAAAAJwAAAA==\"},\"shape\":[40],\"dtype\":\"int32\",\"order\":\"little\"}],[\"age\",{\"type\":\"ndarray\",\"array\":[\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",
\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\"],\"shape\":[40],\"dtype\":\"object\",\"order\":\"little\"}],[\"impact force / body weight\",{\"type\":\"ndarray\",\"array\":{\"type\":\"bytes\",\"data\":\"MzMzMzMz/z9SuB6F61EQQI/C9ShcjwZAFK5H4XoUBECamZmZmZnpP3E9CtejcA1AzczMzMzM7D/hehSuR+EIQBSuR+F6FBFAexSuR+F6CEB7FK5H4XoIQAAAAAAAAARA4XoUrkfhAEDXo3A9CtcFQOxRuB6F6wNAj8L1KFyPAEAAAAAAAAD0Pz0K16NwPQpAAAAAAAAAAEBSuB6F61HoP1K4HoXrUQ5AuB6F61G49j+kcD0K16PoP9ejcD0K1wFAUrgehetR9D/2KFyPwvUMQNejcD0K1/M/rkfhehSu9z9cj8L1KFwLQGZmZmZmZuY/ZmZmZmZm+j8pXI/C9SjkP1K4HoXrUfw/j8L1KFyP4j89CtejcD0GQFyPwvUoXPM/UrgehetR8D/NzMzMzMzsP0jhehSuR/E/hetRuB6F+z8=\"},\"shape\":[40],\"dtype\":\"float64\",\"order\":\"little\"}],[\"ID\",{\"type\":\"ndarray\",\"array\":[\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\"],\"shape\":[40],\"dtype\":\"object\",\"order\":\"little\"}],[\"cat\",{\"type\":\"ndarray\",\"array\":[\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"I\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\",\"II\"],\"shape\":[40],\"dtype\":\"object\",\"order\":\"little\"}],[\"__label\",{\"type\":\"ndarray\",\"array\":[\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\",\"adult\"],\"shape\":[40],\"dtype\":\"object\",\"order\":\"little\"}]]}}},\"view\":{\"type\":\"object\",
\"name\":\"CDSView\",\"id\":\"p1041\",\"attributes\":{\"filter\":{\"type\":\"object\",\"name\":\"AllIndices\",\"id\":\"p1042\"}}},\"glyph\":{\"type\":\"object\",\"name\":\"Circle\",\"id\":\"p1037\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"impact force / body weight\"},\"y\":{\"type\":\"field\",\"field\":\"cat\",\"transform\":{\"type\":\"object\",\"name\":\"Jitter\",\"id\":\"p1030\",\"attributes\":{\"width\":0.1,\"distribution\":\"normal\",\"range\":{\"id\":\"p1001\"}}}},\"line_color\":{\"type\":\"value\",\"value\":\"#1f77b3\"},\"fill_color\":{\"type\":\"value\",\"value\":\"#1f77b3\"},\"hatch_color\":{\"type\":\"value\",\"value\":\"#1f77b3\"}}},\"nonselection_glyph\":{\"type\":\"object\",\"name\":\"Circle\",\"id\":\"p1038\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"impact force / body weight\"},\"y\":{\"type\":\"field\",\"field\":\"cat\",\"transform\":{\"id\":\"p1030\"}},\"line_color\":{\"type\":\"value\",\"value\":\"#1f77b3\"},\"line_alpha\":{\"type\":\"value\",\"value\":0.1},\"fill_color\":{\"type\":\"value\",\"value\":\"#1f77b3\"},\"fill_alpha\":{\"type\":\"value\",\"value\":0.1},\"hatch_color\":{\"type\":\"value\",\"value\":\"#1f77b3\"},\"hatch_alpha\":{\"type\":\"value\",\"value\":0.1}}},\"muted_glyph\":{\"type\":\"object\",\"name\":\"Circle\",\"id\":\"p1039\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"impact force / body 
weight\"},\"y\":{\"type\":\"field\",\"field\":\"cat\",\"transform\":{\"id\":\"p1030\"}},\"line_color\":{\"type\":\"value\",\"value\":\"#1f77b3\"},\"line_alpha\":{\"type\":\"value\",\"value\":0.2},\"fill_color\":{\"type\":\"value\",\"value\":\"#1f77b3\"},\"fill_alpha\":{\"type\":\"value\",\"value\":0.2},\"hatch_color\":{\"type\":\"value\",\"value\":\"#1f77b3\"},\"hatch_alpha\":{\"type\":\"value\",\"value\":0.2}}}}},{\"type\":\"object\",\"name\":\"GlyphRenderer\",\"id\":\"p1052\",\"attributes\":{\"name\":\"hover_glyphs\",\"data_source\":{\"type\":\"object\",\"name\":\"ColumnDataSource\",\"id\":\"p1043\",\"attributes\":{\"selected\":{\"type\":\"object\",\"name\":\"Selection\",\"id\":\"p1044\",\"attributes\":{\"indices\":[],\"line_indices\":[]}},\"selection_policy\":{\"type\":\"object\",\"name\":\"UnionRenderers\",\"id\":\"p1045\"},\"data\":{\"type\":\"map\",\"entries\":[[\"index\",{\"type\":\"ndarray\",\"array\":{\"type\":\"bytes\",\"data\":\"KAAAACkAAAAqAAAAKwAAACwAAAAtAAAALgAAAC8AAAAwAAAAMQAAADIAAAAzAAAANAAAADUAAAA2AAAANwAAADgAAAA5AAAAOgAAADsAAAA8AAAAPQAAAD4AAAA/AAAAQAAAAEEAAABCAAAAQwAAAEQAAABFAAAARgAAAEcAAABIAAAASQAAAEoAAABLAAAATAAAAE0AAABOAAAATwAAAA==\"},\"shape\":[40],\"dtype\":\"int32\",\"order\":\"little\"}],[\"age\",{\"type\":\"ndarray\",\"array\":[\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\"],\"shape\":[40],\"dtype\":\"object\",\"order\":\"little\"}],[\"impact force / body 
weight\",{\"type\":\"ndarray\",\"array\":{\"type\":\"bytes\",\"data\":\"w/UoXI/CE0CkcD0K16MKQOF6FK5H4QRA9ihcj8L1GECkcD0K16MTQIXrUbgehRFASOF6FK5HEUCkcD0K16MSQPYoXI/C9RlAmpmZmZmZDUCF61G4HoUNQClcj8L1KBRA9ihcj8L1E0CF61G4HoURQDMzMzMzMxFAuB6F61G4CEDXo3A9CtcJQMP1KFyPwhNAZmZmZmZmFUBxPQrXo3APQHsUrkfhevQ/zczMzMzM8D/sUbgehevRP/YoXI/C9QpAH4XrUbgeBUDD9Shcj8LFP+xRuB6F6w1APQrXo3A9AEBxPQrXo3AVQFyPwvUoXBFAhetRuB6F9z+F61G4HoX3P8P1KFyPwhFAuB6F61G4DkBI4XoUrkcYQAAAAAAAAAhAAAAAAAAAEkAzMzMzMzMVQEjhehSuRxJA16NwPQrXC0A=\"},\"shape\":[40],\"dtype\":\"float64\",\"order\":\"little\"}],[\"ID\",{\"type\":\"ndarray\",\"array\":[\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\"],\"shape\":[40],\"dtype\":\"object\",\"order\":\"little\"}],[\"cat\",{\"type\":\"ndarray\",\"array\":[\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"III\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\",\"IV\"],\"shape\":[40],\"dtype\":\"object\",\"order\":\"little\"}],[\"__label\",{\"type\":\"ndarray\",\"array\":[\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\",\"juvenile\"],\"shape\":[40],\"dtype\":\"object\",\"order\":\"little\"}]]}}},\"v
iew\":{\"type\":\"object\",\"name\":\"CDSView\",\"id\":\"p1053\",\"attributes\":{\"filter\":{\"type\":\"object\",\"name\":\"AllIndices\",\"id\":\"p1054\"}}},\"glyph\":{\"type\":\"object\",\"name\":\"Circle\",\"id\":\"p1049\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"impact force / body weight\"},\"y\":{\"type\":\"field\",\"field\":\"cat\",\"transform\":{\"id\":\"p1030\"}},\"line_color\":{\"type\":\"value\",\"value\":\"#ff7e0e\"},\"fill_color\":{\"type\":\"value\",\"value\":\"#ff7e0e\"},\"hatch_color\":{\"type\":\"value\",\"value\":\"#ff7e0e\"}}},\"nonselection_glyph\":{\"type\":\"object\",\"name\":\"Circle\",\"id\":\"p1050\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"impact force / body weight\"},\"y\":{\"type\":\"field\",\"field\":\"cat\",\"transform\":{\"id\":\"p1030\"}},\"line_color\":{\"type\":\"value\",\"value\":\"#ff7e0e\"},\"line_alpha\":{\"type\":\"value\",\"value\":0.1},\"fill_color\":{\"type\":\"value\",\"value\":\"#ff7e0e\"},\"fill_alpha\":{\"type\":\"value\",\"value\":0.1},\"hatch_color\":{\"type\":\"value\",\"value\":\"#ff7e0e\"},\"hatch_alpha\":{\"type\":\"value\",\"value\":0.1}}},\"muted_glyph\":{\"type\":\"object\",\"name\":\"Circle\",\"id\":\"p1051\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"impact force / body 
weight\"},\"y\":{\"type\":\"field\",\"field\":\"cat\",\"transform\":{\"id\":\"p1030\"}},\"line_color\":{\"type\":\"value\",\"value\":\"#ff7e0e\"},\"line_alpha\":{\"type\":\"value\",\"value\":0.2},\"fill_color\":{\"type\":\"value\",\"value\":\"#ff7e0e\"},\"fill_alpha\":{\"type\":\"value\",\"value\":0.2},\"hatch_color\":{\"type\":\"value\",\"value\":\"#ff7e0e\"},\"hatch_alpha\":{\"type\":\"value\",\"value\":0.2}}}}}],\"toolbar\":{\"type\":\"object\",\"name\":\"Toolbar\",\"id\":\"p1010\",\"attributes\":{\"tools\":[{\"type\":\"object\",\"name\":\"PanTool\",\"id\":\"p1023\"},{\"type\":\"object\",\"name\":\"WheelZoomTool\",\"id\":\"p1024\"},{\"type\":\"object\",\"name\":\"BoxZoomTool\",\"id\":\"p1025\",\"attributes\":{\"overlay\":{\"type\":\"object\",\"name\":\"BoxAnnotation\",\"id\":\"p1026\",\"attributes\":{\"syncable\":false,\"level\":\"overlay\",\"visible\":false,\"left_units\":\"canvas\",\"right_units\":\"canvas\",\"bottom_units\":\"canvas\",\"top_units\":\"canvas\",\"line_color\":\"black\",\"line_alpha\":1.0,\"line_width\":2,\"line_dash\":[4,4],\"fill_color\":\"lightgrey\",\"fill_alpha\":0.5}}}},{\"type\":\"object\",\"name\":\"SaveTool\",\"id\":\"p1027\"},{\"type\":\"object\",\"name\":\"ResetTool\",\"id\":\"p1028\"},{\"type\":\"object\",\"name\":\"HelpTool\",\"id\":\"p1029\"}]}},\"toolbar_location\":\"above\",\"left\":[{\"type\":\"object\",\"name\":\"CategoricalAxis\",\"id\":\"p1018\",\"attributes\":{\"ticker\":{\"type\":\"object\",\"name\":\"CategoricalTicker\",\"id\":\"p1019\"},\"formatter\":{\"type\":\"object\",\"name\":\"CategoricalTickFormatter\",\"id\":\"p1020\"},\"axis_label\":\"frog 
ID\",\"major_label_policy\":{\"type\":\"object\",\"name\":\"AllLabels\",\"id\":\"p1021\"}}}],\"right\":[{\"type\":\"object\",\"name\":\"Legend\",\"id\":\"p1055\",\"attributes\":{\"location\":\"center\",\"title\":\"age\",\"click_policy\":\"hide\",\"items\":[{\"type\":\"object\",\"name\":\"LegendItem\",\"id\":\"p1056\",\"attributes\":{\"label\":{\"type\":\"value\",\"value\":\"adult\"},\"renderers\":[{\"id\":\"p1040\"}]}},{\"type\":\"object\",\"name\":\"LegendItem\",\"id\":\"p1057\",\"attributes\":{\"label\":{\"type\":\"value\",\"value\":\"juvenile\"},\"renderers\":[{\"id\":\"p1052\"}]}}]}}],\"below\":[{\"type\":\"object\",\"name\":\"LinearAxis\",\"id\":\"p1013\",\"attributes\":{\"ticker\":{\"type\":\"object\",\"name\":\"BasicTicker\",\"id\":\"p1014\",\"attributes\":{\"mantissas\":[1,2,5]}},\"formatter\":{\"type\":\"object\",\"name\":\"BasicTickFormatter\",\"id\":\"p1015\"},\"axis_label\":\"impact force / body weight (mN/g)\",\"major_label_policy\":{\"type\":\"object\",\"name\":\"AllLabels\",\"id\":\"p1016\"}}}],\"center\":[{\"type\":\"object\",\"name\":\"Grid\",\"id\":\"p1017\",\"attributes\":{\"axis\":{\"id\":\"p1013\"}}},{\"type\":\"object\",\"name\":\"Grid\",\"id\":\"p1022\",\"attributes\":{\"dimension\":1,\"axis\":{\"id\":\"p1018\"},\"grid_line_color\":null}}],\"frame_width\":375,\"frame_height\":275}}]}};\n", + " const render_items = [{\"docid\":\"d5db22b2-971e-4bb2-b214-ca6c42cee39b\",\"roots\":{\"p1002\":\"d7e59000-dcd9-44e4-87f0-ad254f913e20\"},\"root_ids\":[\"p1002\"]}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " let attempts = 0;\n", + " const timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run 
BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "p1002" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "p = iqplot.strip(\n", + " df,\n", + " q=\"impact force / body weight\",\n", + " cats=\"ID\",\n", + " color_column=\"age\",\n", + " spread=\"jitter\",\n", + " x_axis_label=\"impact force / body weight (mN/g)\",\n", + " y_axis_label=\"frog ID\"\n", + ")\n", + "\n", + "bokeh.io.show(p)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Apparently Frog III consistently packs a powerful punch, er.... tongue." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Computing environment" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python implementation: CPython\n", + "Python version : 3.11.5\n", + "IPython version : 8.15.0\n", + "\n", + "numpy : 1.24.3\n", + "pandas : 2.0.3\n", + "bokeh : 3.2.1\n", + "iqplot : 0.3.5\n", + "jupyterlab: 4.0.6\n", + "\n" + ] + } + ], + "source": [ + "%load_ext watermark\n", + "%watermark -v -p numpy,pandas,bokeh,iqplot,jupyterlab" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/2024/_sources/lessons/stacking_and_unstacking.ipynb.txt b/2024/_sources/lessons/stacking_and_unstacking.ipynb.txt new file mode 100644 index 00000000..1c196bf1 --- /dev/null +++ 
b/2024/_sources/lessons/stacking_and_unstacking.ipynb.txt @@ -0,0 +1,2358 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reshaping data frames by stacking and unstacking\n", + "\n", + "[Data set download](https://s3.amazonaws.com/bebi103.caltech.edu/data/penguins_subset.csv)\n", + "\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "nbsphinx": "hidden", + "tags": [] + }, + "outputs": [], + "source": [ + "# Colab setup ------------------\n", + "import os, sys, subprocess\n", + "if \"google.colab\" in sys.modules:\n", + " cmd = \"pip install --upgrade iqplot bebi103 iqplot watermark\"\n", + " process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n", + " stdout, stderr = process.communicate()\n", + " data_path = \"https://s3.amazonaws.com/bebi103.caltech.edu/data/\"\n", + "else:\n", + " data_path = \"../data/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " \n", + " Loading BokehJS ...\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function now() {\n", + " return new Date();\n", + " }\n", + "\n", + " const force = true;\n", + "\n", + " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", + " root._bokeh_onload_callbacks = [];\n", + " root._bokeh_is_loading = undefined;\n", + " }\n", + "\n", + "const JS_MIME_TYPE = 'application/javascript';\n", + " const HTML_MIME_TYPE = 'text/html';\n", + " const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", + " const CLASS_NAME = 'output_bokeh rendered_html';\n", + "\n", + " /**\n", + " * Render data to the DOM node\n", + " */\n", + " function render(props, node) {\n", + " const script = document.createElement(\"script\");\n", + " node.appendChild(script);\n", + " }\n", + "\n", + " /**\n", + " * Handle when an output is cleared or removed\n", + " */\n", + " function handleClearOutput(event, handle) {\n", + " const cell = handle.cell;\n", + "\n", + " const id = cell.output_area._bokeh_element_id;\n", + " const server_id = cell.output_area._bokeh_server_id;\n", + " // Clean up Bokeh references\n", + " if (id != null && id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + "\n", + " if (server_id !== undefined) {\n", + " // Clean up Bokeh references\n", + " const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", + " cell.notebook.kernel.execute(cmd_clean, {\n", + " iopub: {\n", + " output: function(msg) {\n", + " const id = msg.content.text.trim();\n", + " if (id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + " }\n", + " }\n", + " });\n", + " // Destroy server and session\n", + " const cmd_destroy = \"import bokeh.io.notebook as ion; 
ion.destroy_server('\" + server_id + \"')\";\n", + " cell.notebook.kernel.execute(cmd_destroy);\n", + " }\n", + " }\n", + "\n", + " /**\n", + " * Handle when a new output is added\n", + " */\n", + " function handleAddOutput(event, handle) {\n", + " const output_area = handle.output_area;\n", + " const output = handle.output;\n", + "\n", + " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", + " if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n", + " return\n", + " }\n", + "\n", + " const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", + "\n", + " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", + " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", + " // store reference to embed id on output_area\n", + " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", + " }\n", + " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", + " const bk_div = document.createElement(\"div\");\n", + " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", + " const script_attrs = bk_div.children[0].attributes;\n", + " for (let i = 0; i < script_attrs.length; i++) {\n", + " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", + " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", + " }\n", + " // store reference to server id on output_area\n", + " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", + " }\n", + " }\n", + "\n", + " function register_renderer(events, OutputArea) {\n", + "\n", + " function append_mime(data, metadata, element) {\n", + " // create a DOM node to render to\n", + " const toinsert = this.create_output_subarea(\n", + " metadata,\n", + " CLASS_NAME,\n", + " EXEC_MIME_TYPE\n", + " );\n", + " 
this.keyboard_manager.register_events(toinsert);\n", + " // Render to node\n", + " const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", + " render(props, toinsert[toinsert.length - 1]);\n", + " element.append(toinsert);\n", + " return toinsert\n", + " }\n", + "\n", + " /* Handle when an output is cleared or removed */\n", + " events.on('clear_output.CodeCell', handleClearOutput);\n", + " events.on('delete.Cell', handleClearOutput);\n", + "\n", + " /* Handle when a new output is added */\n", + " events.on('output_added.OutputArea', handleAddOutput);\n", + "\n", + " /**\n", + " * Register the mime type and append_mime function with output_area\n", + " */\n", + " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", + " /* Is output safe? */\n", + " safe: true,\n", + " /* Index of renderer in `output_area.display_order` */\n", + " index: 0\n", + " });\n", + " }\n", + "\n", + " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", + " if (root.Jupyter !== undefined) {\n", + " const events = require('base/js/events');\n", + " const OutputArea = require('notebook/js/outputarea').OutputArea;\n", + "\n", + " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", + " register_renderer(events, OutputArea);\n", + " }\n", + " }\n", + " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", + " root._bokeh_timeout = Date.now() + 5000;\n", + " root._bokeh_failed_load = false;\n", + " }\n", + "\n", + " const NB_LOAD_WARNING = {'data': {'text/html':\n", + " \"
\\n\"+\n", + " \"

\\n\"+\n", + " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", + " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", + " \"

\\n\"+\n", + " \"\\n\"+\n", + " \"\\n\"+\n", + " \"from bokeh.resources import INLINE\\n\"+\n", + " \"output_notebook(resources=INLINE)\\n\"+\n", + " \"\\n\"+\n", + " \"
\"}};\n", + "\n", + " function display_loaded() {\n", + " const el = document.getElementById(\"1002\");\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS is loading...\";\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", + " }\n", + " } else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(display_loaded, 100)\n", + " }\n", + " }\n", + "\n", + " function run_callbacks() {\n", + " try {\n", + " root._bokeh_onload_callbacks.forEach(function(callback) {\n", + " if (callback != null)\n", + " callback();\n", + " });\n", + " } finally {\n", + " delete root._bokeh_onload_callbacks\n", + " }\n", + " console.debug(\"Bokeh: all callbacks have finished\");\n", + " }\n", + "\n", + " function load_libs(css_urls, js_urls, callback) {\n", + " if (css_urls == null) css_urls = [];\n", + " if (js_urls == null) js_urls = [];\n", + "\n", + " root._bokeh_onload_callbacks.push(callback);\n", + " if (root._bokeh_is_loading > 0) {\n", + " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", + " return null;\n", + " }\n", + " if (js_urls == null || js_urls.length === 0) {\n", + " run_callbacks();\n", + " return null;\n", + " }\n", + " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", + " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", + "\n", + " function on_load() {\n", + " root._bokeh_is_loading--;\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", + " run_callbacks()\n", + " }\n", + " }\n", + "\n", + " function on_error(url) {\n", + " console.error(\"failed to load \" + url);\n", + " }\n", + "\n", + " for (let i = 0; i < css_urls.length; i++) {\n", + " const url = css_urls[i];\n", + " const element = document.createElement(\"link\");\n", + " element.onload = on_load;\n", + " element.onerror = 
on_error.bind(null, url);\n", + " element.rel = \"stylesheet\";\n", + " element.type = \"text/css\";\n", + " element.href = url;\n", + " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " for (let i = 0; i < js_urls.length; i++) {\n", + " const url = js_urls[i];\n", + " const element = document.createElement('script');\n", + " element.onload = on_load;\n", + " element.onerror = on_error.bind(null, url);\n", + " element.async = false;\n", + " element.src = url;\n", + " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", + " document.head.appendChild(element);\n", + " }\n", + " };\n", + "\n", + " function inject_raw_css(css) {\n", + " const element = document.createElement(\"style\");\n", + " element.appendChild(document.createTextNode(css));\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.4.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-2.4.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.4.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.4.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-2.4.3.min.js\"];\n", + " const css_urls = [];\n", + "\n", + " const inline_js = [ function(Bokeh) {\n", + " Bokeh.set_log_level(\"info\");\n", + " },\n", + "function(Bokeh) {\n", + " }\n", + " ];\n", + "\n", + " function run_inline_js() {\n", + " if (root.Bokeh !== undefined || force === true) {\n", + " for (let i = 0; i < inline_js.length; i++) {\n", + " inline_js[i].call(root, root.Bokeh);\n", + " }\n", + "if (force === true) {\n", + " display_loaded();\n", + " }} else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(run_inline_js, 100);\n", + " } else if (!root._bokeh_failed_load) {\n", + " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", + " root._bokeh_failed_load = true;\n", 
+ " } else if (force !== true) {\n", + " const cell = $(document.getElementById(\"1002\")).parents('.cell').data().cell;\n", + " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", + " }\n", + " }\n", + "\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", + " run_inline_js();\n", + " } else {\n", + " load_libs(css_urls, js_urls, function() {\n", + " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", + " run_inline_js();\n", + " });\n", + " }\n", + "}(window));" + ], + "application/vnd.bokehjs_load.v0+json": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(\"1002\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = document.createElement('script');\n 
element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.4.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-2.4.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.4.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.4.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-2.4.3.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\nif (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(\"1002\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "import iqplot\n", + "\n", + "import bokeh.io\n", + 
"bokeh.io.output_notebook()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "We have seen how melting a data frame can bring it to tidy format, but this is not always the best option. **Stacking** and **unstacking** operations can be very useful for shaping data frames. As usual, this is best seen by example, and we will use a subset of the [Palmer penguins data set](https://towardsdatascience.com/penguins-dataset-overview-iris-alternative-9453bb8c8d95), which you can download here: [https://s3.amazonaws.com/bebi103.caltech.edu/data/penguins_subset.csv](https://s3.amazonaws.com/bebi103.caltech.edu/data/penguins_subset.csv). The data set consists of measurements of three different species of penguins acquired at the [Palmer Station in Antarctica](https://en.wikipedia.org/wiki/Palmer_Station). The measurements were made between 2007 and 2009 by [Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Melting the data set too soon\n", + "\n", + "We start by loading in the data set, bearing in mind that for this particular format of it the header rows are rows zero and one." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GentooAdelieChinstrap
bill_depth_mmbill_length_mmflipper_length_mmbody_mass_gbill_depth_mmbill_length_mmflipper_length_mmbody_mass_gbill_depth_mmbill_length_mmflipper_length_mmbody_mass_g
016.348.4220.05400.018.536.8193.03500.018.347.6195.03850.0
115.846.3215.05050.016.937.0185.03000.016.742.5187.03350.0
214.247.5209.04600.019.542.0200.04050.016.640.9187.03200.0
315.748.7208.05350.018.342.7196.04075.020.052.8205.04550.0
414.148.7210.04450.018.035.7202.03550.018.745.4188.03525.0
\n", + "
" + ], + "text/plain": [ + " Gentoo Adelie \\\n", + " bill_depth_mm bill_length_mm flipper_length_mm body_mass_g bill_depth_mm \n", + "0 16.3 48.4 220.0 5400.0 18.5 \n", + "1 15.8 46.3 215.0 5050.0 16.9 \n", + "2 14.2 47.5 209.0 4600.0 19.5 \n", + "3 15.7 48.7 208.0 5350.0 18.3 \n", + "4 14.1 48.7 210.0 4450.0 18.0 \n", + "\n", + " Chinstrap \\\n", + " bill_length_mm flipper_length_mm body_mass_g bill_depth_mm bill_length_mm \n", + "0 36.8 193.0 3500.0 18.3 47.6 \n", + "1 37.0 185.0 3000.0 16.7 42.5 \n", + "2 42.0 200.0 4050.0 16.6 40.9 \n", + "3 42.7 196.0 4075.0 20.0 52.8 \n", + "4 35.7 202.0 3550.0 18.7 45.4 \n", + "\n", + " \n", + " flipper_length_mm body_mass_g \n", + "0 195.0 3850.0 \n", + "1 187.0 3350.0 \n", + "2 187.0 3200.0 \n", + "3 205.0 4550.0 \n", + "4 188.0 3525.0 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(os.path.join(data_path, \"penguins_subset.csv\"), header=[0, 1])\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have a multiindex for the column names. This data set is not tidy because each row corresponds to observations of three different penguins. To tidy it, we could go ahead and melt it.\n", + "\n", + "To ease melting and further analysis, I will do what I always do when I have a multiindex; I will name the levels of the index." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
speciesGentooAdelieChinstrap
propertybill_depth_mmbill_length_mmflipper_length_mmbody_mass_gbill_depth_mmbill_length_mmflipper_length_mmbody_mass_gbill_depth_mmbill_length_mmflipper_length_mmbody_mass_g
016.348.4220.05400.018.536.8193.03500.018.347.6195.03850.0
115.846.3215.05050.016.937.0185.03000.016.742.5187.03350.0
214.247.5209.04600.019.542.0200.04050.016.640.9187.03200.0
315.748.7208.05350.018.342.7196.04075.020.052.8205.04550.0
414.148.7210.04450.018.035.7202.03550.018.745.4188.03525.0
\n", + "
" + ], + "text/plain": [ + "species Gentoo \\\n", + "property bill_depth_mm bill_length_mm flipper_length_mm body_mass_g \n", + "0 16.3 48.4 220.0 5400.0 \n", + "1 15.8 46.3 215.0 5050.0 \n", + "2 14.2 47.5 209.0 4600.0 \n", + "3 15.7 48.7 208.0 5350.0 \n", + "4 14.1 48.7 210.0 4450.0 \n", + "\n", + "species Adelie \\\n", + "property bill_depth_mm bill_length_mm flipper_length_mm body_mass_g \n", + "0 18.5 36.8 193.0 3500.0 \n", + "1 16.9 37.0 185.0 3000.0 \n", + "2 19.5 42.0 200.0 4050.0 \n", + "3 18.3 42.7 196.0 4075.0 \n", + "4 18.0 35.7 202.0 3550.0 \n", + "\n", + "species Chinstrap \n", + "property bill_depth_mm bill_length_mm flipper_length_mm body_mass_g \n", + "0 18.3 47.6 195.0 3850.0 \n", + "1 16.7 42.5 187.0 3350.0 \n", + "2 16.6 40.9 187.0 3200.0 \n", + "3 20.0 52.8 205.0 4550.0 \n", + "4 18.7 45.4 188.0 3525.0 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns.names = [\"species\", \"property\"]\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are now ready for a melt. Because we have names for the levels of our multiindex, the melt function will automatically name the columns of the resulting melted data frame." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
speciespropertyvalue
0Gentoobill_depth_mm16.3
1Gentoobill_depth_mm15.8
2Gentoobill_depth_mm14.2
3Gentoobill_depth_mm15.7
4Gentoobill_depth_mm14.1
\n", + "
" + ], + "text/plain": [ + " species property value\n", + "0 Gentoo bill_depth_mm 16.3\n", + "1 Gentoo bill_depth_mm 15.8\n", + "2 Gentoo bill_depth_mm 14.2\n", + "3 Gentoo bill_depth_mm 15.7\n", + "4 Gentoo bill_depth_mm 14.1" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_melted = df.melt()\n", + "\n", + "df_melted.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This seems like a tidy data frame, and it is, but we have lost information! Specifically, we lost track of which individual penguin each measurement belongs to. A total of 204 penguins were measured (68 for each species), and four properties of each were measured. The melted data frame has 204 × 4 = 816 rows. This is no good, since we want to know which penguin each _set_ of four measurements belongs to.\n", + "\n", + "So, we want a tidy data frame that has five columns, each column containing a variable for one set of measurements. The variables are the beak depth, beak length, flipper length, and weight, and which species the penguin being measured is." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Stacking\n", + "\n", + "**Stacking** is a procedure that takes a wide data frame and makes it narrower by converting the names at a level of a columnar multiindex into an index in the data frame. In this case, we want to take the species, as given in the top level of the column multiindex, and make it an index. Let's perform the stacking operation and see what we get." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
propertybill_depth_mmbill_length_mmbody_mass_gflipper_length_mm
species
0Adelie18.536.83500.0193.0
Chinstrap18.347.63850.0195.0
Gentoo16.348.45400.0220.0
1Adelie16.937.03000.0185.0
Chinstrap16.742.53350.0187.0
\n", + "
" + ], + "text/plain": [ + "property bill_depth_mm bill_length_mm body_mass_g flipper_length_mm\n", + " species \n", + "0 Adelie 18.5 36.8 3500.0 193.0\n", + " Chinstrap 18.3 47.6 3850.0 195.0\n", + " Gentoo 16.3 48.4 5400.0 220.0\n", + "1 Adelie 16.9 37.0 3000.0 185.0\n", + " Chinstrap 16.7 42.5 3350.0 187.0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.stack(level=\"species\")\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now have a multiindex for the data frame index. The outer index is nameless and the inner index carries the name \"species\". We no longer have a multiindex for the column names, but have the sets of columns we like.\n", + "\n", + "This structure of the data frame makes sense, but it would be easier to understand if we swapped the levels of the index." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
propertybill_depth_mmbill_length_mmbody_mass_gflipper_length_mm
species
Adelie018.536.83500.0193.0
Chinstrap018.347.63850.0195.0
Gentoo016.348.45400.0220.0
Adelie116.937.03000.0185.0
Chinstrap116.742.53350.0187.0
\n", + "
" + ], + "text/plain": [ + "property bill_depth_mm bill_length_mm body_mass_g flipper_length_mm\n", + "species \n", + "Adelie 0 18.5 36.8 3500.0 193.0\n", + "Chinstrap 0 18.3 47.6 3850.0 195.0\n", + "Gentoo 0 16.3 48.4 5400.0 220.0\n", + "Adelie 1 16.9 37.0 3000.0 185.0\n", + "Chinstrap 1 16.7 42.5 3350.0 187.0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.swaplevel(axis=\"index\")\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, the second index is the measurement number for a given species. It becomes easier to understand if we provide a name for the inner index and sort by species." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
propertybill_depth_mmbill_length_mmbody_mass_gflipper_length_mm
speciespenguin number
Adelie018.536.83500.0193.0
116.937.03000.0185.0
219.542.04050.0200.0
318.342.74075.0196.0
418.035.73550.0202.0
\n", + "
" + ], + "text/plain": [ + "property bill_depth_mm bill_length_mm body_mass_g \\\n", + "species penguin number \n", + "Adelie 0 18.5 36.8 3500.0 \n", + " 1 16.9 37.0 3000.0 \n", + " 2 19.5 42.0 4050.0 \n", + " 3 18.3 42.7 4075.0 \n", + " 4 18.0 35.7 3550.0 \n", + "\n", + "property flipper_length_mm \n", + "species penguin number \n", + "Adelie 0 193.0 \n", + " 1 185.0 \n", + " 2 200.0 \n", + " 3 196.0 \n", + " 4 202.0 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.index.names = [\"species\", \"penguin number\"]\n", + "df = df.sort_index(level=\"species\")\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, if we want to make a plot out of this, we need to convert the row index to columns of the data frame, which we can do with the `reset_index()` method." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
propertyspeciespenguin numberbill_depth_mmbill_length_mmbody_mass_gflipper_length_mm
0Adelie018.536.83500.0193.0
1Adelie116.937.03000.0185.0
2Adelie219.542.04050.0200.0
3Adelie318.342.74075.0196.0
4Adelie418.035.73550.0202.0
\n", + "
" + ], + "text/plain": [ + "property species penguin number bill_depth_mm bill_length_mm body_mass_g \\\n", + "0 Adelie 0 18.5 36.8 3500.0 \n", + "1 Adelie 1 16.9 37.0 3000.0 \n", + "2 Adelie 2 19.5 42.0 4050.0 \n", + "3 Adelie 3 18.3 42.7 4075.0 \n", + "4 Adelie 4 18.0 35.7 3550.0 \n", + "\n", + "property flipper_length_mm \n", + "0 193.0 \n", + "1 185.0 \n", + "2 200.0 \n", + "3 196.0 \n", + "4 202.0 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.reset_index()\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we do not need the column index to be named." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
speciespenguin numberbill_depth_mmbill_length_mmbody_mass_gflipper_length_mm
0Adelie018.536.83500.0193.0
1Adelie116.937.03000.0185.0
2Adelie219.542.04050.0200.0
3Adelie318.342.74075.0196.0
4Adelie418.035.73550.0202.0
\n", + "
" + ], + "text/plain": [ + " species penguin number bill_depth_mm bill_length_mm body_mass_g \\\n", + "0 Adelie 0 18.5 36.8 3500.0 \n", + "1 Adelie 1 16.9 37.0 3000.0 \n", + "2 Adelie 2 19.5 42.0 4050.0 \n", + "3 Adelie 3 18.3 42.7 4075.0 \n", + "4 Adelie 4 18.0 35.7 3550.0 \n", + "\n", + " flipper_length_mm \n", + "0 193.0 \n", + "1 185.0 \n", + "2 200.0 \n", + "3 196.0 \n", + "4 202.0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns.name = None\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Beautiful! A tidy data frame! (Note that we can delete the `'penguin number'` column if we like because it is irrelevant. In most situations, I would delete it, but we will use it for illustrative purposes later in this lesson.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reshaping by unstacking\n", + "\n", + "Sometimes, our data set is tidy, contains all of the information we need, but is not in a convenient format. As an example, I will create a tidy data frame for the penguin data where each row is a single measurement of a single feature of a penguin. This is what we had before when we melted the data frame too soon, but if we melt the data frame now, specifying `'penguin number'` and `'species'` as ID variables, we get a tidy data frame that still has all of the information in the data set." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
penguin numberspeciesvariablevalue
00Adeliebill_depth_mm18.5
11Adeliebill_depth_mm16.9
22Adeliebill_depth_mm19.5
33Adeliebill_depth_mm18.3
44Adeliebill_depth_mm18.0
\n", + "
" + ], + "text/plain": [ + " penguin number species variable value\n", + "0 0 Adelie bill_depth_mm 18.5\n", + "1 1 Adelie bill_depth_mm 16.9\n", + "2 2 Adelie bill_depth_mm 19.5\n", + "3 3 Adelie bill_depth_mm 18.3\n", + "4 4 Adelie bill_depth_mm 18.0" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.melt(id_vars=[\"penguin number\", \"species\"])\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This format is useful, but working with it can be more painful than in the previous format where each row corresponded to a measurement of a given _penguin_ as opposed to one measurement of one property of a penguin. The present data frame is in a sense too tall. We would like to widen it, or **unstack** it.\n", + "\n", + "To do this unstacking operation, we need to do two steps:\n", + "\n", + "1. Set the index of the data frame to be a multiindex based on all columns that are necessary to have a unique index for each row. In this case, that would be `'species'` and `'penguin number'`. Additionally, use the column whose entries you want to become column names upon unstacking as part of the multiindex. In this case, this is `'variable'`.\n", + "2. Perform an unstacking operation with the level being the level of the multiindex you want to become the column names (in our case `'variable'`).\n", + "\n", + "Let's do it!" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value
variablebill_depth_mmbill_length_mmbody_mass_gflipper_length_mm
speciespenguin number
Adelie018.536.83500.0193.0
116.937.03000.0185.0
219.542.04050.0200.0
318.342.74075.0196.0
418.035.73550.0202.0
\n", + "
" + ], + "text/plain": [ + " value \\\n", + "variable bill_depth_mm bill_length_mm body_mass_g \n", + "species penguin number \n", + "Adelie 0 18.5 36.8 3500.0 \n", + " 1 16.9 37.0 3000.0 \n", + " 2 19.5 42.0 4050.0 \n", + " 3 18.3 42.7 4075.0 \n", + " 4 18.0 35.7 3550.0 \n", + "\n", + " \n", + "variable flipper_length_mm \n", + "species penguin number \n", + "Adelie 0 193.0 \n", + " 1 185.0 \n", + " 2 200.0 \n", + " 3 196.0 \n", + " 4 202.0 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Set index for necessary unique identifiers and column with desired column names\n", + "df = df.set_index([\"species\", \"penguin number\", \"variable\"])\n", + "\n", + "# Unstack\n", + "df = df.unstack(level=\"variable\")\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is close to the shape we want. We have multiindexes for both the rows and columns. To flatten the multiindexed column names, we could use `df.columns.to_flat_index()`, but this converts the multiindex to a single index comprised of tuples. So, the column names would be something like `('value', 'bill_depth_mm')`. Instead, we just want the inner level of the multiindex, which has a name `'variable'`. We can set the columns by getting the values of the indices at this level." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variablebill_depth_mmbill_length_mmbody_mass_gflipper_length_mm
speciespenguin number
Adelie018.536.83500.0193.0
116.937.03000.0185.0
219.542.04050.0200.0
318.342.74075.0196.0
418.035.73550.0202.0
\n", + "
" + ], + "text/plain": [ + "variable bill_depth_mm bill_length_mm body_mass_g \\\n", + "species penguin number \n", + "Adelie 0 18.5 36.8 3500.0 \n", + " 1 16.9 37.0 3000.0 \n", + " 2 19.5 42.0 4050.0 \n", + " 3 18.3 42.7 4075.0 \n", + " 4 18.0 35.7 3550.0 \n", + "\n", + "variable flipper_length_mm \n", + "species penguin number \n", + "Adelie 0 193.0 \n", + " 1 185.0 \n", + " 2 200.0 \n", + " 3 196.0 \n", + " 4 202.0 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns = df.columns.get_level_values(level=\"variable\")\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now all we are left to do is reset the index to bring the species and penguin number entries from indexes to columns in the data frame." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variablespeciespenguin numberbill_depth_mmbill_length_mmbody_mass_gflipper_length_mm
0Adelie018.536.83500.0193.0
1Adelie116.937.03000.0185.0
2Adelie219.542.04050.0200.0
3Adelie318.342.74075.0196.0
4Adelie418.035.73550.0202.0
\n", + "
" + ], + "text/plain": [ + "variable species penguin number bill_depth_mm bill_length_mm body_mass_g \\\n", + "0 Adelie 0 18.5 36.8 3500.0 \n", + "1 Adelie 1 16.9 37.0 3000.0 \n", + "2 Adelie 2 19.5 42.0 4050.0 \n", + "3 Adelie 3 18.3 42.7 4075.0 \n", + "4 Adelie 4 18.0 35.7 3550.0 \n", + "\n", + "variable flipper_length_mm \n", + "0 193.0 \n", + "1 185.0 \n", + "2 200.0 \n", + "3 196.0 \n", + "4 202.0 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.reset_index()\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### A couple of plots for fun\n", + "\n", + "Now that we've done all this work and our data set is tidy, let's make a plot for fun. First, we'll plot the ECDFs of the bill lengths." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " const docs_json = {\"3f3defcc-f124-42a8-a706-aa8fc6c1fd1f\":{\"defs\":[],\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1012\"}],\"center\":[{\"id\":\"1015\"},{\"id\":\"1019\"}],\"frame_height\":275,\"frame_width\":400,\"left\":[{\"id\":\"1016\"}],\"renderers\":[{\"id\":\"1039\"},{\"id\":\"1046\"},{\"id\":\"1053\"}],\"right\":[{\"id\":\"1055\"}],\"title\":{\"id\":\"1059\"},\"toolbar\":{\"id\":\"1027\"},\"toolbar_location\":\"above\",\"x_range\":{\"id\":\"1004\"},\"x_scale\":{\"id\":\"1008\"},\"y_range\":{\"id\":\"1006\"},\"y_scale\":{\"id\":\"1010\"}},\"id\":\"1003\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1021\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1025\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1020\",\"type\":\"PanTool\"},{\"attributes\":{\"label\":{\"value\":\"Chinstrap\"},\"renderers\":[{\"id\":\"1046\"}]},\"id\":\"1057\",\"type\":\"LegendItem\"},{\"attributes\":{\"axis_label\":\"ECDF\",\"coordinates\":null,\"formatter\":{\"id\":\"1062\"},\"group\":null,\"major_label_policy\":{\"id\":\"1063\"},\"ticker\":{\"id\":\"1017\"}},\"id\":\"1016\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1013\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis_label\":\"bill length 
(mm)\",\"coordinates\":null,\"formatter\":{\"id\":\"1065\"},\"group\":null,\"major_label_policy\":{\"id\":\"1066\"},\"ticker\":{\"id\":\"1013\"}},\"id\":\"1012\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1023\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1069\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data\":{\"__ECDF\":{\"__ndarray__\":\"S0tLS0tL4z9aWlpaWlraP2lpaWlpaeE/tbS0tLS05D8tLS0tLS3lP/Hw8PDw8Og/AAAAAAAA6D+Ih4eHh4fnP+Lh4eHh4dE/Hh4eHh4e5j9LS0tLS0vbPzw8PDw8POQ/PDw8PDw83D88PDw8PDzsP7W0tLS0tNQ/aWlpaWlp6T+XlpaWlpamP1paWlpaWso/eHh4eHh46D88PDw8PDzMP+Lh4eHh4ek/Hh4eHh4ejj8PDw8PDw/vP9PS0tLS0uo/LS0tLS0t7T+XlpaWlpbmP9PS0tLS0rI/xMPDw8PD6z9aWlpaWlrqP5eWlpaWlu4/eHh4eHh42D8AAAAAAADgP4iHh4eHh+8/l5aWlpaW1j/T0tLS0tLSPw8PDw8PD+c/pqWlpaWl7T8tLS0tLS3dP7W0tLS0tMQ/l5aWlpaWxj/Ew8PDw8PTP9PS0tLS0sI/8fDw8PDw4D8eHh4eHh7OP+Lh4eHh4eE/8fDw8PDw0D+mpaWlpaXlPw8PDw8PD98/WlpaWlpauj+XlpaWlpa2Px4eHh4eHr4/AAAAAAAA0D9aWlpaWlriP2lpaWlpadk/8fDw8PDwwD+Ih4eHh4fXPx4eHh4eHq4/Hh4eHh4enj+1tLS0tLTsPx4eHh4eHu4/Hh4eHh4e3j9LS0tLS0vrP9PS0tLS0uI/xMPDw8PD4z94eHh4eHjgPwAAAAAAAPA/eHh4eHh4yD+mpaWlpaXVPw==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"__label\":[\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\"],\"bill_depth_mm\":{\"__ndarray__\":\"zczMzMxMMECamZmZmZkvQGZmZmZmZixAZmZmZmZmL0AzM
zMzMzMsQAAAAAAAAC5AZmZmZmZmL0BmZmZmZmYuQAAAAAAAAC5AMzMzMzMzMEAAAAAAAAAuQAAAAAAAADBAAAAAAAAAK0BmZmZmZmYvQM3MzMzMzCtAzczMzMzML0BmZmZmZmYrQJqZmZmZmSpAMzMzMzMzMECamZmZmZkvQJqZmZmZmS5AZmZmZmZmK0CamZmZmZkvQAAAAAAAAC5AAAAAAACAMECamZmZmRkwQDMzMzMzMypAzczMzMzML0BmZmZmZmYuQDMzMzMzMy9AzczMzMzMLUCamZmZmRkwQAAAAAAAADBAMzMzMzMzLkAAAAAAAAAtQAAAAAAAAC1AAAAAAAAAMUAAAAAAAAAtQGZmZmZmZi1AmpmZmZmZLEDNzMzMzMwrQM3MzMzMTDFAAAAAAAAAL0CamZmZmZkrQDMzMzMzMy5AZmZmZmZmK0AzMzMzMzMuQGZmZmZmZixAzczMzMzMKkAAAAAAAAAtQGZmZmZmZi5AmpmZmZmZLUCamZmZmZksQAAAAAAAAC1AMzMzMzMzK0BmZmZmZmYqQGZmZmZmZixAZmZmZmZmK0DNzMzMzEwxQJqZmZmZGTFAzczMzMzMLECamZmZmZksQDMzMzMzMy9AMzMzMzMzLUCamZmZmZksQAAAAAAAADFAZmZmZmZmL0AzMzMzMzMtQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"bill_length_mm\":{\"__ndarray__\":\"MzMzMzMzSEBmZmZmZiZHQAAAAAAAwEdAmpmZmZlZSECamZmZmVlIQM3MzMzMzEhAZmZmZmamSECamZmZmZlIQAAAAAAAwEZAZmZmZmZmSEAzMzMzMzNHQM3MzMzMTEhAAAAAAABAR0BmZmZmZmZJQJqZmZmZ2UZAAAAAAAAASUCamZmZmVlFQDMzMzMzc0ZAAAAAAADASECamZmZmZlGQAAAAAAAAElAMzMzMzNzREAzMzMzM7NKQM3MzMzMDElAzczMzMyMSUAAAAAAAIBIQDMzMzMzc0VAAAAAAABASUAAAAAAAABJQAAAAAAAQEpAmpmZmZkZR0BmZmZmZmZHQM3MzMzMjEtAzczMzMwMR0AAAAAAAMBGQM3MzMzMjEhAzczMzMwMSkAAAAAAAEBHQAAAAAAAQEZAAAAAAABARkAAAAAAAMBGQDMzMzMzM0ZAmpmZmZmZR0CamZmZmZlGQM3MzMzMDEhAZmZmZmamRkCamZmZmVlIQM3MzMzMTEdAZmZmZmamRUCamZmZmZlFQAAAAAAAwEVAmpmZmZmZRkCamZmZmRlIQJqZmZmZGUdAAAAAAAAARkDNzMzMzAxHQGZmZmZmZkVAzczMzMxMRUBmZmZmZmZJQJqZmZmZGUpAAAAAAABAR0CamZmZmRlJQJqZmZmZGUhAMzMzMzMzSEBmZmZmZmZHQDMzMzMz80tAAAAAAABARkBmZmZmZuZGQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"body_mass_g\":{\"__ndarray__\":\"AAAAAAAYtUAAAAAAALqzQAAAAAAA+LFAAAAAAADmtEAAAAAAAGKxQAAAAAAAjrJAAAAAAADatkAAAAAAAJy4QAAAAAAAiLNAAAAAAABwt0AAAAAAAFyyQAAAAAAAqLZAAAAAAADGsUAAAAAAAFC0QAAAAAAAMLFAAAAAAADmtEAAAAAAANyuQAAAAAAA7LNAAAAAAACotkAAAAAAALS0QAAAAAAArrVAAAAAAAAqskAAAAAAAHy1QAAAAAAAiLNAAAAAAACCtEAAAAAAAK61QAAAAAAAiLNAAAAAAACutUAAAAAAAES2QAAAAAAASrVAAAAAAAC0tEAAAAAAAHy1QAAAAAAA2rZAAAAAAADss0AAAAAAAI6yQAAAAAAAEbJAAAAAAACu
tUAAAAAAADCxQAAAAAAA8rJAAAAAAAAEsEAAAAAAAGiwQAAAAAAAgrRAAAAAAABvs0AAAAAAAI6yQAAAAAAAfLVAAAAAAADMsEAAAAAAAOa0QAAAAAAA8rJAAAAAAAAwsUAAAAAAAGKxQAAAAAAAKrJAAAAAAABQtEAAAAAAAPixQAAAAAAAwLJAAAAAAAD+sEAAAAAAAJSxQAAAAAAAXLJAAAAAAABWs0AAAAAAAOC1QAAAAAAAGLVAAAAAAAAks0AAAAAAAES2QAAAAAAA7LNAAAAAAADatkAAAAAAAPKyQAAAAAAA4LVAAAAAAAALs0AAAAAAAGiwQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"flipper_length_mm\":{\"__ndarray__\":\"AAAAAACAa0AAAAAAAOBqQAAAAAAAIGpAAAAAAAAAakAAAAAAAEBqQAAAAAAAAGtAAAAAAAAga0AAAAAAAKBrQAAAAAAAgGtAAAAAAADAa0AAAAAAAABrQAAAAAAAwGxAAAAAAABAakAAAAAAAEBsQAAAAAAAwGpAAAAAAAAAbEAAAAAAAABqQAAAAAAAoGpAAAAAAACgbEAAAAAAAOBqQAAAAAAAgGtAAAAAAADAakAAAAAAAGBrQAAAAAAAIGxAAAAAAAAgbEAAAAAAAABrQAAAAAAA4GpAAAAAAADAa0AAAAAAAEBrQAAAAAAAoGtAAAAAAACga0AAAAAAAOBqQAAAAAAAwGxAAAAAAADgakAAAAAAAIBqQAAAAAAAgGpAAAAAAADAbEAAAAAAAKBqQAAAAAAAwGpAAAAAAAAAa0AAAAAAAEBqQAAAAAAAYGtAAAAAAADgakAAAAAAAOBqQAAAAAAAIGpAAAAAAABAakAAAAAAAMBrQAAAAAAAQGpAAAAAAAAgakAAAAAAAABqQAAAAAAAoGpAAAAAAACAakAAAAAAAEBqQAAAAAAAIGpAAAAAAAAAakAAAAAAAGBqQAAAAAAAIGpAAAAAAACgakAAAAAAAIBsQAAAAAAAgGxAAAAAAAAga0AAAAAAAEBrQAAAAAAAoGtAAAAAAACgakAAAAAAAOBqQAAAAAAAgGxAAAAAAAAga0AAAAAAAEBqQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"index\":[136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203],\"penguin 
number\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67],\"species\":[\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\"]},\"selected\":{\"id\":\"1072\"},\"selection_policy\":{\"id\":\"1071\"}},\"id\":\"1048\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1024\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1070\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1062\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"source\":{\"id\":\"1041\"}},\"id\":\"1047\",\"type\":\"CDSView\"},{\"attributes\":{\"tools\":[{\"id\":\"1020\"},{\"id\":\"1021\"},{\"id\":\"1022\"},{\"id\":\"1023\"},{\"id\":\"1024\"},{\"id\":\"1025\"}]},\"id\":\"1027\",\"type\":\"Toolbar\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b3\"},\"line_color\":{\"value\":\"#1f77b3\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1036\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1063\",\"type\":\"AllLabels\"},{\"attributes\":{\"coordinates\":null,\"data_source\":{\"id\":\"1048\"},\"glyph\":{\"id\":\"1050\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1052\"},\"nonselection_glyph\":{\"id\":\"1051\"
},\"view\":{\"id\":\"1054\"}},\"id\":\"1053\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"1012\"},\"coordinates\":null,\"group\":null,\"ticker\":null},\"id\":\"1015\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1065\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1066\",\"type\":\"AllLabels\"},{\"attributes\":{\"fill_color\":{\"value\":\"#ff7e0e\"},\"line_color\":{\"value\":\"#ff7e0e\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1043\",\"type\":\"Circle\"},{\"attributes\":{\"overlay\":{\"id\":\"1026\"}},\"id\":\"1022\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1004\",\"type\":\"DataRange1d\"},{\"attributes\":{\"click_policy\":\"hide\",\"coordinates\":null,\"group\":null,\"items\":[{\"id\":\"1056\"},{\"id\":\"1057\"},{\"id\":\"1058\"}],\"location\":\"center\"},\"id\":\"1055\",\"type\":\"Legend\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b3\"},\"hatch_alpha\":{\"value\":0.1},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b3\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1037\",\"type\":\"Circle\"},{\"attributes\":{\"label\":{\"value\":\"Adelie\"},\"renderers\":[{\"id\":\"1039\"}]},\"id\":\"1056\",\"type\":\"LegendItem\"},{\"attributes\":{\"coordinates\":null,\"data_source\":{\"id\":\"1041\"},\"glyph\":{\"id\":\"1043\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1045\"},\"nonselection_glyph\":{\"id\":\"1044\"},\"view\":{\"id\":\"1047\"}},\"id\":\"1046\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1067\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1071\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"value\":\"#ff7e0e\"},\"hatch_alpha\":{\"value\":0.2},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"value\":\"#ff7e0e\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1045\",\"type\"
:\"Circle\"},{\"attributes\":{\"data\":{\"__ECDF\":{\"__ndarray__\":\"pqWlpaWl1T+XlpaWlpbWPzw8PDw8POw/LS0tLS0t7T8eHh4eHh6+Px4eHh4eHuY/8fDw8PDw6D/Ew8PDw8PTP5eWlpaWlqY/iIeHh4eH5z/x8PDw8PDAP9PS0tLS0rI/eHh4eHh46D9paWlpaWnZPzw8PDw8PNw/4uHh4eHh6T+1tLS0tLTsP3h4eHh4eMg/LS0tLS0t5T9aWlpaWlrKP+Lh4eHh4eE/AAAAAAAA4D94eHh4eHjgP2lpaWlpaeE/xMPDw8PD6z8PDw8PDw/fP7W0tLS0tOQ/WlpaWlpa4j9LS0tLS0vjP9PS0tLS0sI/Hh4eHh4ezj8AAAAAAADwP1paWlpaWuo/Hh4eHh4e3j/T0tLS0tLqPx4eHh4eHu4/WlpaWlpa2j8PDw8PDw/nP7W0tLS0tNQ/Hh4eHh4ejj+XlpaWlpa2P4iHh4eHh9c/4uHh4eHh0T9paWlpaWnpP5eWlpaWlu4/Dw8PDw8P7z8AAAAAAADQP/Hw8PDw8OA/Hh4eHh4enj+mpaWlpaXlP7W0tLS0tMQ/l5aWlpaWxj9LS0tLS0vbP9PS0tLS0tI/LS0tLS0t3T94eHh4eHjYPzw8PDw8PMw/xMPDw8PD4z88PDw8PDzkPx4eHh4eHq4/8fDw8PDw0D/T0tLS0tLiP6alpaWlpe0/S0tLS0tL6z+XlpaWlpbmPwAAAAAAAOg/iIeHh4eH7z9aWlpaWlq6Pw==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"__label\":[\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\"],\"bill_depth_mm\":{\"__ndarray__\":\"AAAAAACAMkBmZmZmZuYwQAAAAAAAgDNAzczMzMxMMkAAAAAAAAAyQJqZmZmZGTNAZmZmZmZmMkBmZmZmZmYyQJqZmZmZGTJAAAAAAAAAMkBmZmZmZuYwQDMzMzMzMzFAAAAAAAAAM0DNzMzMzMwxQAAAAAAAgDJAmpmZmZkZM0AzMzMzMzM0QDMzMzMzMzFAzczMzMzMMkDNzMzMzEwxQAAAAAAAADFAmpmZmZkZMkCamZmZmRkxQJqZmZmZmTJAzczMzMxMMkDNzMzMzMwzQM3MzMzMzDFAAAAAAACAMEDNzMzMzMwxQJqZmZmZmTBAAAAAAACAM0AAAAAAAIA1Q
AAAAAAAADNAmpmZmZkZM0AzMzMzMzMyQAAAAAAAADNAzczMzMzMMEAAAAAAAAAxQM3MzMzMTDNAAAAAAAAAM0BmZmZmZuYxQAAAAAAAgDBAmpmZmZmZMEDNzMzMzMwwQAAAAAAAgDJAAAAAAAAAMkAAAAAAAAAxQM3MzMzMTDFAmpmZmZkZMUAzMzMzM7M0QJqZmZmZGTFAZmZmZmbmMUAAAAAAAIA0QAAAAAAAADJAZmZmZmbmMkCamZmZmRkyQJqZmZmZGTBAzczMzMzMMkAzMzMzM7MyQJqZmZmZGTJAmpmZmZkZMUCamZmZmZkxQAAAAAAAgDJAmpmZmZmZMkBmZmZmZuYyQGZmZmZm5jFAZmZmZmbmMkDNzMzMzMwvQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"bill_length_mm\":{\"__ndarray__\":\"ZmZmZmZmQkAAAAAAAIBCQAAAAAAAAEVAmpmZmZlZRUCamZmZmdlBQGZmZmZm5kNAZmZmZmZmREDNzMzMzExCQM3MzMzMDEFAZmZmZmYmRECamZmZmdlBQM3MzMzMTEFAzczMzMxMREBmZmZmZqZCQAAAAAAAwEJAzczMzMyMREAAAAAAAABFQJqZmZmZGUJAzczMzMzMQ0CamZmZmRlCQM3MzMzMDENAZmZmZmbmQkBmZmZmZuZCQDMzMzMz80JAAAAAAADARECamZmZmdlCQAAAAAAAwENAzczMzMwMQ0AzMzMzM3NDQDMzMzMz80FAZmZmZmYmQkAAAAAAAABHQM3MzMzMjERAzczMzMzMQkDNzMzMzIxEQJqZmZmZmUVAZmZmZmamQkCamZmZmRlEQJqZmZmZWUJAAAAAAADAQEAAAAAAAIBBQAAAAAAAgEJAAAAAAABAQkAzMzMzM3NEQJqZmZmZmUVAzczMzMwMRkAzMzMzMzNCQGZmZmZm5kJAAAAAAAAAQUDNzMzMzMxDQAAAAAAAAEJAAAAAAAAAQkBmZmZmZqZCQAAAAAAAQEJAAAAAAADAQkCamZmZmZlCQJqZmZmZGUJAMzMzMzNzQ0AAAAAAAIBDQAAAAAAAQEFAMzMzMzMzQkDNzMzMzAxDQGZmZmZmZkVAzczMzMyMREDNzMzMzAxEQAAAAAAAQERAZmZmZmbmRkCamZmZmZlBQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"body_mass_g\":{\"__ndarray__\":\"AAAAAABYq0AAAAAAAHCnQAAAAAAApK9AAAAAAADWr0AAAAAAALyrQAAAAAAAKrJAAAAAAAB4rkAAAAAAACarQAAAAAAAJqtAAAAAAABkqUAAAAAAAJyoQAAAAAAAAKlAAAAAAABAr0AAAAAAACyqQAAAAAAAe7FAAAAAAAAEsEAAAAAAAJqwQAAAAAAAnKhAAAAAAAD4sUAAAAAAAMipQAAAAAAAzqhAAAAAAABMrUAAAAAAAMipQAAAAAAAnKhAAAAAAADMsEAAAAAAAFirQAAAAAAAyKlAAAAAAADirUAAAAAAAFKsQAAAAAAA1KdAAAAAAACwrUAAAAAAAGiwQAAAAAAAwqpAAAAAAABMrUAAAAAAAKSvQAAAAAAAp7JAAAAAAABwp0AAAAAAAPSqQAAAAAAA9KpAAAAAAAAgrEAAAAAAABqtQAAAAAAAkKpAAAAAAABEpkAAAAAAAOisQAAAAAAABLBAAAAAAABAr0AAAAAAAPqpQAAAAAAA6KxAAAAAAACQqkAAAAAAAHiuQAAAAAAA6KxAAAAAAAD0qkAAAAAAAH6tQAAAAAAAnKhAAAAAAAA+p0AAAAAAAHiuQAAAAAAAvKtAAAAAAAAgrEAAAAAAAISsQAAAAAAAqKZAAAAAAABEpkAAAAAAAMKqQAAAAAAAmrBAAAAAAAD6qUAAAAAA
AMywQAAAAAAAAKlAAAAAAAA2sEAAAAAAANSnQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"flipper_length_mm\":{\"__ndarray__\":\"AAAAAAAgaEAAAAAAACBnQAAAAAAAAGlAAAAAAACAaEAAAAAAAEBpQAAAAAAAAGdAAAAAAABgaEAAAAAAAABnQAAAAAAAIGhAAAAAAABgaEAAAAAAACBnQAAAAAAAoGdAAAAAAADgaEAAAAAAAOBnQAAAAAAA4GhAAAAAAACAZ0AAAAAAAMBnQAAAAAAAYGdAAAAAAADAZ0AAAAAAAGBnQAAAAAAAoGZAAAAAAAAgaEAAAAAAAEBnQAAAAAAAgGVAAAAAAABgaEAAAAAAAMBoQAAAAAAAgGdAAAAAAADAaEAAAAAAAKBmQAAAAAAAwGdAAAAAAADAZ0AAAAAAAEBoQAAAAAAAwGZAAAAAAABAaEAAAAAAAABoQAAAAAAAoGhAAAAAAAAAaEAAAAAAAABmQAAAAAAAIGhAAAAAAADAZ0AAAAAAAABoQAAAAAAAIGdAAAAAAACgZkAAAAAAAOBnQAAAAAAAAGhAAAAAAABAakAAAAAAAGBoQAAAAAAAgGZAAAAAAAAgZ0AAAAAAAOBnQAAAAAAAYGdAAAAAAADAZ0AAAAAAAOBoQAAAAAAAwGZAAAAAAABgZkAAAAAAAEBmQAAAAAAAYGdAAAAAAADAZ0AAAAAAACBnQAAAAAAAYGdAAAAAAAAAZ0AAAAAAAGBnQAAAAAAAYGhAAAAAAACgZ0AAAAAAAIBnQAAAAAAAYGdAAAAAAACgaEAAAAAAAEBnQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"index\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67],\"penguin 
number\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67],\"species\":[\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\"]},\"selected\":{\"id\":\"1068\"},\"selection_policy\":{\"id\":\"1067\"}},\"id\":\"1034\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1072\",\"type\":\"Selection\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#ff7e0e\"},\"hatch_alpha\":{\"value\":0.1},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#ff7e0e\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1044\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1068\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"1034\"}},\"id\":\"1040\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1006\",\"type\":\"DataRange1d\"},{\"attributes\":{\"coordinates\":null,\"data_source\":{\"id\":\"1034\"},\"glyph\":{\"id\":\"1036\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1038\"},\"nonselection_glyph\":{\"id\":\"1037\"},\"view\":{\"id\":\"1040\"}},\"id\":\"1039\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"label\":{\"value\":\"Gentoo\"},\"renderers\":[{\"id\":\
"1053\"}]},\"id\":\"1058\",\"type\":\"LegendItem\"},{\"attributes\":{\"bottom_units\":\"screen\",\"coordinates\":null,\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"group\":null,\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"syncable\":false,\"top_units\":\"screen\"},\"id\":\"1026\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"coordinates\":null,\"group\":null},\"id\":\"1059\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1008\",\"type\":\"LinearScale\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"value\":\"#2ba02b\"},\"hatch_alpha\":{\"value\":0.2},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"value\":\"#2ba02b\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1052\",\"type\":\"Circle\"},{\"attributes\":{\"fill_color\":{\"value\":\"#2ba02b\"},\"line_color\":{\"value\":\"#2ba02b\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1050\",\"type\":\"Circle\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#2ba02b\"},\"hatch_alpha\":{\"value\":0.1},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#2ba02b\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1051\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1010\",\"type\":\"LinearScale\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"value\":\"#1f77b3\"},\"hatch_alpha\":{\"value\":0.2},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"value\":\"#1f77b3\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1038\",\"type\":\"Circle\"},{\"attributes\":{\"data\":{\"__ECDF\":{\"__ndarray__\":\"aWlpaWlp2T+XlpaWlpamPx4eHh4eHo4/Hh4eHh4e7j/x8PDw8PDAP3h4eHh4eOA/S0tLS0tL2z8eHh4eHh7ePx4eHh4eHuY/S0tLS0tL6z/Ew8PDw8PrPzw8PDw8POw/tbS0tLS0xD+XlpaWlpbGP5eWlpaWltY/eHh4eHh4yD94eHh4eHjoPw8PDw8PD98/09LS0tLS4j/x8PDw8PDoPy0tLS0tLeU/Hh4
eHh4erj/T0tLS0tLqP8TDw8PDw9M/l5aWlpaW7j9paWlpaWnhPwAAAAAAAPA/pqWlpaWl5T+1tLS0tLTsP5eWlpaWluY/AAAAAAAA0D8eHh4eHh6ePw8PDw8PD+c/tbS0tLS01D88PDw8PDzcP1paWlpaWro/LS0tLS0t3T/T0tLS0tLSPzw8PDw8POQ/WlpaWlpa4j9paWlpaWnpPy0tLS0tLe0/iIeHh4eH1z8eHh4eHh6+P8TDw8PDw+M/pqWlpaWl1T9aWlpaWlrqP/Hw8PDw8NA/WlpaWlpayj/T0tLS0tKyP6alpaWlpe0/WlpaWlpa2j94eHh4eHjYP+Lh4eHh4dE/S0tLS0tL4z+XlpaWlpa2P/Hw8PDw8OA/4uHh4eHh4T8PDw8PDw/vPwAAAAAAAOA/iIeHh4eH7z/T0tLS0tLCP4iHh4eHh+c/AAAAAAAA6D/i4eHh4eHpP7W0tLS0tOQ/Hh4eHh4ezj88PDw8PDzMPw==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"__label\":[\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\"],\"bill_depth_mm\":{\"__ndarray__\":\"zczMzMxMMkAzMzMzM7MwQJqZmZmZmTBAAAAAAAAANEAzMzMzM7MyQDMzMzMzMzJAAAAAAACAMUAzMzMzMzMyQAAAAAAAADNAAAAAAACAM0CamZmZmRkyQDMzMzMzszRAZmZmZmZmM0DNzMzMzEwxQJqZmZmZmTBAAAAAAAAAMUAzMzMzMzMzQGZmZmZm5jNAzczMzMzMMkAzMzMzMzMyQGZmZmZmZjNAzczMzMxMMUDNzMzMzEw0QM3MzMzMzDFAZmZmZmbmM0DNzMzMzEwxQM3MzMzMzDFAMzMzMzOzM0AAAAAAAAAzQAAAAAAAgDJAAAAAAACAMUDNzMzMzEwxQJqZmZmZGTNAZmZmZmbmMUCamZmZmZkzQM3MzMzMzDFAAAAAAACAM0BmZmZmZuYxQJqZmZmZmTNAZmZmZmbmMUBmZmZmZu
YzQM3MzMzMzDJAzczMzMxMMUCamZmZmZkwQAAAAAAAADRAAAAAAACAMEAzMzMzM7MyQM3MzMzMzDFAmpmZmZkZMUCamZmZmZkwQM3MzMzMzDNAZmZmZmZmMEDNzMzMzMwwQJqZmZmZmTJAMzMzMzOzMkCamZmZmRkyQJqZmZmZmTJAAAAAAACAM0DNzMzMzMw0QAAAAAAAADNAzczMzMzMM0AAAAAAAAAxQGZmZmZm5jFAzczMzMzMMkAAAAAAAAAzQGZmZmZmZjJAMzMzMzMzMkBmZmZmZuYyQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"bill_length_mm\":{\"__ndarray__\":\"zczMzMzMR0AAAAAAAEBFQDMzMzMzc0RAZmZmZmZmSkAzMzMzM7NGQM3MzMzMzEhAAAAAAABASECamZmZmZlIQGZmZmZmZklAMzMzMzPzSUAAAAAAAABKQAAAAAAAAEpAzczMzMzMRkCamZmZmdlGQDMzMzMzc0dAmpmZmZnZRkBmZmZmZqZJQGZmZmZmpkhAmpmZmZkZSUBmZmZmZqZJQM3MzMzMTElAAAAAAABARUCamZmZmdlJQM3MzMzMTEdAAAAAAADASkBmZmZmZuZIQAAAAAAAAE1AmpmZmZlZSUAAAAAAAABKQGZmZmZmZklAmpmZmZkZR0AzMzMzMzNFQDMzMzMzc0lAmpmZmZlZR0AAAAAAAIBIQJqZmZmZmUZAAAAAAACASEAAAAAAAEBHQAAAAAAAQElAzczMzMwMSUBmZmZmZqZJQJqZmZmZGUpAAAAAAACAR0CamZmZmZlGQGZmZmZmJklAZmZmZmZmR0AAAAAAAMBJQDMzMzMzM0dAMzMzMzPzRkCamZmZmZlFQJqZmZmZWUpAzczMzMwMSEAAAAAAAMBHQDMzMzMzM0dAmpmZmZkZSUAAAAAAAMBFQJqZmZmZ2UhAAAAAAAAASUCamZmZmRlLQAAAAAAAwEhAZmZmZmbmS0AAAAAAAMBGQDMzMzMzc0lAAAAAAACASUAzMzMzM7NJQAAAAAAAQElAzczMzMwMR0AAAAAAAABHQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"body_mass_g\":{\"__ndarray__\":\"AAAAAAAUrkAAAAAAACyqQAAAAAAAAKlAAAAAAADGsUAAAAAAAIqrQAAAAAAAfq1AAAAAAACQqkAAAAAAADCxQAAAAAAABLBAAAAAAADcrkAAAAAAAKSvQAAAAAAAwLJAAAAAAACKq0AAAAAAACCsQAAAAAAAGKVAAAAAAACErEAAAAAAAISsQAAAAAAApK9AAAAAAACwrUAAAAAAAEytQAAAAAAAsK1AAAAAAAAsqkAAAAAAAH6tQAAAAAAAsK1AAAAAAACUsUAAAAAAALasQAAAAAAA6KxAAAAAAACkr0AAAAAAADawQAAAAAAAYrFAAAAAAACErEAAAAAAACCsQAAAAAAAvKtAAAAAAADIqUAAAAAAAMywQAAAAAAA3K5AAAAAAADcrkAAAAAAAFirQAAAAAAApK9AAAAAAACQqkAAAAAAAOisQAAAAAAA9KpAAAAAAADorEAAAAAAAGSpQAAAAAAAyKlAAAAAAACErEAAAAAAAGSpQAAAAAAA6KxAAAAAAADuq0AAAAAAAKimQAAAAAAAGq1AAAAAAAD6qUAAAAAAAHiuQAAAAAAA9KpAAAAAAAB+rUAAAAAAAJCqQAAAAAAAIKxAAAAAAAB4rkAAAAAAAMywQAAAAAAAsK1AAAAAAABAr0AAAAAAAFirQAAAAAAAtqxAAAAAAAAEsEAAAAAAANyuQAAAAAAAkKpAAAAAAABkqUAAAAAAADawQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape
\":[68]},\"flipper_length_mm\":{\"__ndarray__\":\"AAAAAABgaEAAAAAAAGBnQAAAAAAAYGdAAAAAAACgaUAAAAAAAIBnQAAAAAAAIGhAAAAAAADgZ0AAAAAAAGBoQAAAAAAAQGpAAAAAAADAaUAAAAAAACBpQAAAAAAAQGpAAAAAAABAaEAAAAAAACBoQAAAAAAAAGhAAAAAAABgaEAAAAAAACBoQAAAAAAAYGlAAAAAAABAaUAAAAAAAKBoQAAAAAAAIGhAAAAAAABgZ0AAAAAAAEBoQAAAAAAAIGhAAAAAAACgaUAAAAAAAMBoQAAAAAAAoGZAAAAAAABgaUAAAAAAAKBoQAAAAAAAIGlAAAAAAABgZ0AAAAAAAKBmQAAAAAAAgGhAAAAAAABgaEAAAAAAAIBqQAAAAAAAwGhAAAAAAABAakAAAAAAAABoQAAAAAAAIGlAAAAAAADAZ0AAAAAAAMBoQAAAAAAAoGhAAAAAAAAgZ0AAAAAAAOBnQAAAAAAAoGhAAAAAAACgZ0AAAAAAAGBnQAAAAAAA4GdAAAAAAADAZ0AAAAAAAGBnQAAAAAAAoGhAAAAAAADgaEAAAAAAAOBoQAAAAAAAwGdAAAAAAADAaEAAAAAAAEBpQAAAAAAAYGhAAAAAAACAaEAAAAAAACBpQAAAAAAAAGlAAAAAAADgaUAAAAAAAIBoQAAAAAAAgGhAAAAAAABgaUAAAAAAACBpQAAAAAAAAGlAAAAAAABAZkAAAAAAAGBoQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"index\":[68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135],\"penguin 
number\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67],\"species\":[\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\"]},\"selected\":{\"id\":\"1070\"},\"selection_policy\":{\"id\":\"1069\"}},\"id\":\"1041\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1017\",\"type\":\"BasicTicker\"},{\"attributes\":{\"source\":{\"id\":\"1048\"}},\"id\":\"1054\",\"type\":\"CDSView\"},{\"attributes\":{\"axis\":{\"id\":\"1016\"},\"coordinates\":null,\"dimension\":1,\"group\":null,\"ticker\":null},\"id\":\"1019\",\"type\":\"Grid\"}],\"root_ids\":[\"1003\"]},\"title\":\"Bokeh Application\",\"version\":\"2.4.3\"}};\n", + " const render_items = [{\"docid\":\"3f3defcc-f124-42a8-a706-aa8fc6c1fd1f\",\"root_ids\":[\"1003\"],\"roots\":{\"1003\":\"db10addd-fa97-496d-9d3b-e2dd4bbf4425\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } 
else {\n", + " let attempts = 0;\n", + " const timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1003" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "bokeh.io.show(\n", + " iqplot.ecdf(\n", + " data=df,\n", + " cats='species',\n", + " q='bill_length_mm',\n", + " x_axis_label='bill length (mm)',\n", + " frame_width=400,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also plot bill length versus flipper length to see if we can see a difference among the species." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " const docs_json = {\"2647b03c-76bb-4c14-ab26-4a64702e9ccd\":{\"defs\":[],\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1158\"}],\"center\":[{\"id\":\"1161\"},{\"id\":\"1165\"}],\"frame_height\":300,\"frame_width\":300,\"left\":[{\"id\":\"1162\"}],\"renderers\":[{\"id\":\"1185\"},{\"id\":\"1192\"},{\"id\":\"1199\"}],\"right\":[{\"id\":\"1201\"}],\"title\":{\"id\":\"1220\"},\"toolbar\":{\"id\":\"1173\"},\"toolbar_location\":\"above\",\"x_range\":{\"id\":\"1150\"},\"x_scale\":{\"id\":\"1154\"},\"y_range\":{\"id\":\"1152\"},\"y_scale\":{\"id\":\"1156\"}},\"id\":\"1149\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1233\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1229\",\"type\":\"Selection\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"value\":\"#1f77b4\"},\"hatch_alpha\":{\"value\":0.2},\"hatch_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"flipper_length_mm\"}},\"id\":\"1184\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1232\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"1180\"}},\"id\":\"1186\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_color\":{\"value\":\"#ff7f0e\"},\"hatch_color\":{\"value\":\"#ff7f0e\"},\"line_color\":{\"value\":\"#ff7f0e\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"flipper_length_mm\"}},\"id\":\"1189\",\"type\":\"Circle\"},{\"attributes\":{\"coordinates\":null,\"data_source\":{\"id\":\"1187\"},\"glyph\":{\"id\":\"1189\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1191\"},\"nonselection_glyph\":{\"id\":\"1190\"},\"view\":{\"id\":\"1193\"}},\"id\":\"1192\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"coordinates\":null,\
"data_source\":{\"id\":\"1194\"},\"glyph\":{\"id\":\"1196\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1198\"},\"nonselection_glyph\":{\"id\":\"1197\"},\"view\":{\"id\":\"1200\"}},\"id\":\"1199\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{\"bill_depth_mm\":{\"__ndarray__\":\"zczMzMxMMECamZmZmZkvQGZmZmZmZixAZmZmZmZmL0AzMzMzMzMsQAAAAAAAAC5AZmZmZmZmL0BmZmZmZmYuQAAAAAAAAC5AMzMzMzMzMEAAAAAAAAAuQAAAAAAAADBAAAAAAAAAK0BmZmZmZmYvQM3MzMzMzCtAzczMzMzML0BmZmZmZmYrQJqZmZmZmSpAMzMzMzMzMECamZmZmZkvQJqZmZmZmS5AZmZmZmZmK0CamZmZmZkvQAAAAAAAAC5AAAAAAACAMECamZmZmRkwQDMzMzMzMypAzczMzMzML0BmZmZmZmYuQDMzMzMzMy9AzczMzMzMLUCamZmZmRkwQAAAAAAAADBAMzMzMzMzLkAAAAAAAAAtQAAAAAAAAC1AAAAAAAAAMUAAAAAAAAAtQGZmZmZmZi1AmpmZmZmZLEDNzMzMzMwrQM3MzMzMTDFAAAAAAAAAL0CamZmZmZkrQDMzMzMzMy5AZmZmZmZmK0AzMzMzMzMuQGZmZmZmZixAzczMzMzMKkAAAAAAAAAtQGZmZmZmZi5AmpmZmZmZLUCamZmZmZksQAAAAAAAAC1AMzMzMzMzK0BmZmZmZmYqQGZmZmZmZixAZmZmZmZmK0DNzMzMzEwxQJqZmZmZGTFAzczMzMzMLECamZmZmZksQDMzMzMzMy9AMzMzMzMzLUCamZmZmZksQAAAAAAAADFAZmZmZmZmL0AzMzMzMzMtQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"bill_length_mm\":{\"__ndarray__\":\"MzMzMzMzSEBmZmZmZiZHQAAAAAAAwEdAmpmZmZlZSECamZmZmVlIQM3MzMzMzEhAZmZmZmamSECamZmZmZlIQAAAAAAAwEZAZmZmZmZmSEAzMzMzMzNHQM3MzMzMTEhAAAAAAABAR0BmZmZmZmZJQJqZmZmZ2UZAAAAAAAAASUCamZmZmVlFQDMzMzMzc0ZAAAAAAADASECamZmZmZlGQAAAAAAAAElAMzMzMzNzREAzMzMzM7NKQM3MzMzMDElAzczMzMyMSUAAAAAAAIBIQDMzMzMzc0VAAAAAAABASUAAAAAAAABJQAAAAAAAQEpAmpmZmZkZR0BmZmZmZmZHQM3MzMzMjEtAzczMzMwMR0AAAAAAAMBGQM3MzMzMjEhAzczMzMwMSkAAAAAAAEBHQAAAAAAAQEZAAAAAAABARkAAAAAAAMBGQDMzMzMzM0ZAmpmZmZmZR0CamZmZmZlGQM3MzMzMDEhAZmZmZmamRkCamZmZmVlIQM3MzMzMTEdAZmZmZmamRUCamZmZmZlFQAAAAAAAwEVAmpmZmZmZRkCamZmZmRlIQJqZmZmZGUdAAAAAAAAARkDNzMzMzAxHQGZmZmZmZkVAzczMzMxMRUBmZmZmZmZJQJqZmZmZGUpAAAAAAABAR0CamZmZmRlJQJqZmZmZGUhAMzMzMzMzSEBmZmZmZmZHQDMzMzMz80tAAAAAAABARkBmZmZmZuZGQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"body_mass_g\":{\"__ndarray__\":\"AAAAAAAYtUAAAAAAALqzQAAAAAAA+LFAAAAAAAD
mtEAAAAAAAGKxQAAAAAAAjrJAAAAAAADatkAAAAAAAJy4QAAAAAAAiLNAAAAAAABwt0AAAAAAAFyyQAAAAAAAqLZAAAAAAADGsUAAAAAAAFC0QAAAAAAAMLFAAAAAAADmtEAAAAAAANyuQAAAAAAA7LNAAAAAAACotkAAAAAAALS0QAAAAAAArrVAAAAAAAAqskAAAAAAAHy1QAAAAAAAiLNAAAAAAACCtEAAAAAAAK61QAAAAAAAiLNAAAAAAACutUAAAAAAAES2QAAAAAAASrVAAAAAAAC0tEAAAAAAAHy1QAAAAAAA2rZAAAAAAADss0AAAAAAAI6yQAAAAAAAEbJAAAAAAACutUAAAAAAADCxQAAAAAAA8rJAAAAAAAAEsEAAAAAAAGiwQAAAAAAAgrRAAAAAAABvs0AAAAAAAI6yQAAAAAAAfLVAAAAAAADMsEAAAAAAAOa0QAAAAAAA8rJAAAAAAAAwsUAAAAAAAGKxQAAAAAAAKrJAAAAAAABQtEAAAAAAAPixQAAAAAAAwLJAAAAAAAD+sEAAAAAAAJSxQAAAAAAAXLJAAAAAAABWs0AAAAAAAOC1QAAAAAAAGLVAAAAAAAAks0AAAAAAAES2QAAAAAAA7LNAAAAAAADatkAAAAAAAPKyQAAAAAAA4LVAAAAAAAALs0AAAAAAAGiwQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"flipper_length_mm\":{\"__ndarray__\":\"AAAAAACAa0AAAAAAAOBqQAAAAAAAIGpAAAAAAAAAakAAAAAAAEBqQAAAAAAAAGtAAAAAAAAga0AAAAAAAKBrQAAAAAAAgGtAAAAAAADAa0AAAAAAAABrQAAAAAAAwGxAAAAAAABAakAAAAAAAEBsQAAAAAAAwGpAAAAAAAAAbEAAAAAAAABqQAAAAAAAoGpAAAAAAACgbEAAAAAAAOBqQAAAAAAAgGtAAAAAAADAakAAAAAAAGBrQAAAAAAAIGxAAAAAAAAgbEAAAAAAAABrQAAAAAAA4GpAAAAAAADAa0AAAAAAAEBrQAAAAAAAoGtAAAAAAACga0AAAAAAAOBqQAAAAAAAwGxAAAAAAADgakAAAAAAAIBqQAAAAAAAgGpAAAAAAADAbEAAAAAAAKBqQAAAAAAAwGpAAAAAAAAAa0AAAAAAAEBqQAAAAAAAYGtAAAAAAADgakAAAAAAAOBqQAAAAAAAIGpAAAAAAABAakAAAAAAAMBrQAAAAAAAQGpAAAAAAAAgakAAAAAAAABqQAAAAAAAoGpAAAAAAACAakAAAAAAAEBqQAAAAAAAIGpAAAAAAAAAakAAAAAAAGBqQAAAAAAAIGpAAAAAAACgakAAAAAAAIBsQAAAAAAAgGxAAAAAAAAga0AAAAAAAEBrQAAAAAAAoGtAAAAAAACgakAAAAAAAOBqQAAAAAAAgGxAAAAAAAAga0AAAAAAAEBqQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"index\":[136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203],\"penguin 
number\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67],\"species\":[\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\"]},\"selected\":{\"id\":\"1233\"},\"selection_policy\":{\"id\":\"1232\"}},\"id\":\"1194\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"coordinates\":null,\"data_source\":{\"id\":\"1180\"},\"glyph\":{\"id\":\"1182\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1184\"},\"nonselection_glyph\":{\"id\":\"1183\"},\"view\":{\"id\":\"1186\"}},\"id\":\"1185\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1170\",\"type\":\"ResetTool\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"hatch_alpha\":{\"value\":0.1},\"hatch_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"flipper_length_mm\"}},\"id\":\"1183\",\"type\":\"Circle\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#ff7f0e\"},\"hatch_alpha\":{\"value\":0.1},\"hatch_color\":{\"value\":\"#ff7f0e\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#ff7f0e\"}
,\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"flipper_length_mm\"}},\"id\":\"1190\",\"type\":\"Circle\"},{\"attributes\":{\"axis\":{\"id\":\"1162\"},\"coordinates\":null,\"dimension\":1,\"group\":null,\"ticker\":null},\"id\":\"1165\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1223\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"axis\":{\"id\":\"1158\"},\"coordinates\":null,\"group\":null,\"ticker\":null},\"id\":\"1161\",\"type\":\"Grid\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"value\":\"#ff7f0e\"},\"hatch_alpha\":{\"value\":0.2},\"hatch_color\":{\"value\":\"#ff7f0e\"},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"value\":\"#ff7f0e\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"flipper_length_mm\"}},\"id\":\"1191\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1159\",\"type\":\"BasicTicker\"},{\"attributes\":{\"coordinates\":null,\"group\":null},\"id\":\"1220\",\"type\":\"Title\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"value\":\"#2ca02c\"},\"hatch_alpha\":{\"value\":0.2},\"hatch_color\":{\"value\":\"#2ca02c\"},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"value\":\"#2ca02c\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"flipper_length_mm\"}},\"id\":\"1198\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1169\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1163\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1154\",\"type\":\"LinearScale\"},{\"attributes\":{\"overlay\":{\"id\":\"1172\"}},\"id\":\"1168\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"bottom_units\":\"screen\",\"coordinates\":null,\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"group\":null,\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"syncable\":false,\"top_units\":\"screen\"},\"id\":\"1172\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"data\":{\"bill_depth_mm\":{\"__nda
rray__\":\"AAAAAACAMkBmZmZmZuYwQAAAAAAAgDNAzczMzMxMMkAAAAAAAAAyQJqZmZmZGTNAZmZmZmZmMkBmZmZmZmYyQJqZmZmZGTJAAAAAAAAAMkBmZmZmZuYwQDMzMzMzMzFAAAAAAAAAM0DNzMzMzMwxQAAAAAAAgDJAmpmZmZkZM0AzMzMzMzM0QDMzMzMzMzFAzczMzMzMMkDNzMzMzEwxQAAAAAAAADFAmpmZmZkZMkCamZmZmRkxQJqZmZmZmTJAzczMzMxMMkDNzMzMzMwzQM3MzMzMzDFAAAAAAACAMEDNzMzMzMwxQJqZmZmZmTBAAAAAAACAM0AAAAAAAIA1QAAAAAAAADNAmpmZmZkZM0AzMzMzMzMyQAAAAAAAADNAzczMzMzMMEAAAAAAAAAxQM3MzMzMTDNAAAAAAAAAM0BmZmZmZuYxQAAAAAAAgDBAmpmZmZmZMEDNzMzMzMwwQAAAAAAAgDJAAAAAAAAAMkAAAAAAAAAxQM3MzMzMTDFAmpmZmZkZMUAzMzMzM7M0QJqZmZmZGTFAZmZmZmbmMUAAAAAAAIA0QAAAAAAAADJAZmZmZmbmMkCamZmZmRkyQJqZmZmZGTBAzczMzMzMMkAzMzMzM7MyQJqZmZmZGTJAmpmZmZkZMUCamZmZmZkxQAAAAAAAgDJAmpmZmZmZMkBmZmZmZuYyQGZmZmZm5jFAZmZmZmbmMkDNzMzMzMwvQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"bill_length_mm\":{\"__ndarray__\":\"ZmZmZmZmQkAAAAAAAIBCQAAAAAAAAEVAmpmZmZlZRUCamZmZmdlBQGZmZmZm5kNAZmZmZmZmREDNzMzMzExCQM3MzMzMDEFAZmZmZmYmRECamZmZmdlBQM3MzMzMTEFAzczMzMxMREBmZmZmZqZCQAAAAAAAwEJAzczMzMyMREAAAAAAAABFQJqZmZmZGUJAzczMzMzMQ0CamZmZmRlCQM3MzMzMDENAZmZmZmbmQkBmZmZmZuZCQDMzMzMz80JAAAAAAADARECamZmZmdlCQAAAAAAAwENAzczMzMwMQ0AzMzMzM3NDQDMzMzMz80FAZmZmZmYmQkAAAAAAAABHQM3MzMzMjERAzczMzMzMQkDNzMzMzIxEQJqZmZmZmUVAZmZmZmamQkCamZmZmRlEQJqZmZmZWUJAAAAAAADAQEAAAAAAAIBBQAAAAAAAgEJAAAAAAABAQkAzMzMzM3NEQJqZmZmZmUVAzczMzMwMRkAzMzMzMzNCQGZmZmZm5kJAAAAAAAAAQUDNzMzMzMxDQAAAAAAAAEJAAAAAAAAAQkBmZmZmZqZCQAAAAAAAQEJAAAAAAADAQkCamZmZmZlCQJqZmZmZGUJAMzMzMzNzQ0AAAAAAAIBDQAAAAAAAQEFAMzMzMzMzQkDNzMzMzAxDQGZmZmZmZkVAzczMzMyMREDNzMzMzAxEQAAAAAAAQERAZmZmZmbmRkCamZmZmZlBQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"body_mass_g\":{\"__ndarray__\":\"AAAAAABYq0AAAAAAAHCnQAAAAAAApK9AAAAAAADWr0AAAAAAALyrQAAAAAAAKrJAAAAAAAB4rkAAAAAAACarQAAAAAAAJqtAAAAAAABkqUAAAAAAAJyoQAAAAAAAAKlAAAAAAABAr0AAAAAAACyqQAAAAAAAe7FAAAAAAAAEsEAAAAAAAJqwQAAAAAAAnKhAAAAAAAD4sUAAAAAAAMipQAAAAAAAzqhAAAAAAABMrUAAAAAAAMipQAAAAAAAnKhAAAAAAADMsEAAAAAAAFirQAAAAAAAyKlAAAAAAADirUAAAAAAAFKsQAAAAAAA1KdAAAAAAACwrUAAAAAA
AGiwQAAAAAAAwqpAAAAAAABMrUAAAAAAAKSvQAAAAAAAp7JAAAAAAABwp0AAAAAAAPSqQAAAAAAA9KpAAAAAAAAgrEAAAAAAABqtQAAAAAAAkKpAAAAAAABEpkAAAAAAAOisQAAAAAAABLBAAAAAAABAr0AAAAAAAPqpQAAAAAAA6KxAAAAAAACQqkAAAAAAAHiuQAAAAAAA6KxAAAAAAAD0qkAAAAAAAH6tQAAAAAAAnKhAAAAAAAA+p0AAAAAAAHiuQAAAAAAAvKtAAAAAAAAgrEAAAAAAAISsQAAAAAAAqKZAAAAAAABEpkAAAAAAAMKqQAAAAAAAmrBAAAAAAAD6qUAAAAAAAMywQAAAAAAAAKlAAAAAAAA2sEAAAAAAANSnQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"flipper_length_mm\":{\"__ndarray__\":\"AAAAAAAgaEAAAAAAACBnQAAAAAAAAGlAAAAAAACAaEAAAAAAAEBpQAAAAAAAAGdAAAAAAABgaEAAAAAAAABnQAAAAAAAIGhAAAAAAABgaEAAAAAAACBnQAAAAAAAoGdAAAAAAADgaEAAAAAAAOBnQAAAAAAA4GhAAAAAAACAZ0AAAAAAAMBnQAAAAAAAYGdAAAAAAADAZ0AAAAAAAGBnQAAAAAAAoGZAAAAAAAAgaEAAAAAAAEBnQAAAAAAAgGVAAAAAAABgaEAAAAAAAMBoQAAAAAAAgGdAAAAAAADAaEAAAAAAAKBmQAAAAAAAwGdAAAAAAADAZ0AAAAAAAEBoQAAAAAAAwGZAAAAAAABAaEAAAAAAAABoQAAAAAAAoGhAAAAAAAAAaEAAAAAAAABmQAAAAAAAIGhAAAAAAADAZ0AAAAAAAABoQAAAAAAAIGdAAAAAAACgZkAAAAAAAOBnQAAAAAAAAGhAAAAAAABAakAAAAAAAGBoQAAAAAAAgGZAAAAAAAAgZ0AAAAAAAOBnQAAAAAAAYGdAAAAAAADAZ0AAAAAAAOBoQAAAAAAAwGZAAAAAAABgZkAAAAAAAEBmQAAAAAAAYGdAAAAAAADAZ0AAAAAAACBnQAAAAAAAYGdAAAAAAAAAZ0AAAAAAAGBnQAAAAAAAYGhAAAAAAACgZ0AAAAAAAIBnQAAAAAAAYGdAAAAAAACgaEAAAAAAAEBnQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"index\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67],\"penguin 
number\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67],\"species\":[\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\"]},\"selected\":{\"id\":\"1229\"},\"selection_policy\":{\"id\":\"1228\"}},\"id\":\"1180\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis_label\":\"flipper length (mm)\",\"coordinates\":null,\"formatter\":{\"id\":\"1223\"},\"group\":null,\"major_label_policy\":{\"id\":\"1224\"},\"ticker\":{\"id\":\"1163\"}},\"id\":\"1162\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1230\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"axis_label\":\"bill length 
(mm)\",\"coordinates\":null,\"formatter\":{\"id\":\"1226\"},\"group\":null,\"major_label_policy\":{\"id\":\"1227\"},\"ticker\":{\"id\":\"1159\"}},\"id\":\"1158\",\"type\":\"LinearAxis\"},{\"attributes\":{\"source\":{\"id\":\"1194\"}},\"id\":\"1200\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1224\",\"type\":\"AllLabels\"},{\"attributes\":{},\"id\":\"1156\",\"type\":\"LinearScale\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"hatch_color\":{\"value\":\"#1f77b4\"},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"flipper_length_mm\"}},\"id\":\"1182\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1231\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1152\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1226\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"label\":{\"value\":\"Adelie\"},\"renderers\":[{\"id\":\"1185\"}]},\"id\":\"1202\",\"type\":\"LegendItem\"},{\"attributes\":{},\"id\":\"1166\",\"type\":\"PanTool\"},{\"attributes\":{\"source\":{\"id\":\"1187\"}},\"id\":\"1193\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#2ca02c\"},\"hatch_alpha\":{\"value\":0.1},\"hatch_color\":{\"value\":\"#2ca02c\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#2ca02c\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"flipper_length_mm\"}},\"id\":\"1197\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1171\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1150\",\"type\":\"DataRange1d\"},{\"attributes\":{\"click_policy\":\"hide\",\"coordinates\":null,\"group\":null,\"items\":[{\"id\":\"1202\"},{\"id\":\"1203\"},{\"id\":\"1204\"}],\"location\":\"center\"},\"id\":\"1201\",\"type\":\"Legend\"},{\"attributes\":{},\"id\":\"1227\",\"type\":\"AllLabels\"},{\"attributes\":{\"label\":{\"value\":\"Gentoo\"},\"renderers\":[{\"id\":\"1199\"}]},\"id\":\"1204\",\"type\":\"LegendItem\"},{\"attributes\":{},\"id\":\"1167\",\"type\":\"W
heelZoomTool\"},{\"attributes\":{\"tools\":[{\"id\":\"1166\"},{\"id\":\"1167\"},{\"id\":\"1168\"},{\"id\":\"1169\"},{\"id\":\"1170\"},{\"id\":\"1171\"}]},\"id\":\"1173\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1228\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data\":{\"bill_depth_mm\":{\"__ndarray__\":\"zczMzMxMMkAzMzMzM7MwQJqZmZmZmTBAAAAAAAAANEAzMzMzM7MyQDMzMzMzMzJAAAAAAACAMUAzMzMzMzMyQAAAAAAAADNAAAAAAACAM0CamZmZmRkyQDMzMzMzszRAZmZmZmZmM0DNzMzMzEwxQJqZmZmZmTBAAAAAAAAAMUAzMzMzMzMzQGZmZmZm5jNAzczMzMzMMkAzMzMzMzMyQGZmZmZmZjNAzczMzMxMMUDNzMzMzEw0QM3MzMzMzDFAZmZmZmbmM0DNzMzMzEwxQM3MzMzMzDFAMzMzMzOzM0AAAAAAAAAzQAAAAAAAgDJAAAAAAACAMUDNzMzMzEwxQJqZmZmZGTNAZmZmZmbmMUCamZmZmZkzQM3MzMzMzDFAAAAAAACAM0BmZmZmZuYxQJqZmZmZmTNAZmZmZmbmMUBmZmZmZuYzQM3MzMzMzDJAzczMzMxMMUCamZmZmZkwQAAAAAAAADRAAAAAAACAMEAzMzMzM7MyQM3MzMzMzDFAmpmZmZkZMUCamZmZmZkwQM3MzMzMzDNAZmZmZmZmMEDNzMzMzMwwQJqZmZmZmTJAMzMzMzOzMkCamZmZmRkyQJqZmZmZmTJAAAAAAACAM0DNzMzMzMw0QAAAAAAAADNAzczMzMzMM0AAAAAAAAAxQGZmZmZm5jFAzczMzMzMMkAAAAAAAAAzQGZmZmZmZjJAMzMzMzMzMkBmZmZmZuYyQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"bill_length_mm\":{\"__ndarray__\":\"zczMzMzMR0AAAAAAAEBFQDMzMzMzc0RAZmZmZmZmSkAzMzMzM7NGQM3MzMzMzEhAAAAAAABASECamZmZmZlIQGZmZmZmZklAMzMzMzPzSUAAAAAAAABKQAAAAAAAAEpAzczMzMzMRkCamZmZmdlGQDMzMzMzc0dAmpmZmZnZRkBmZmZmZqZJQGZmZmZmpkhAmpmZmZkZSUBmZmZmZqZJQM3MzMzMTElAAAAAAABARUCamZmZmdlJQM3MzMzMTEdAAAAAAADASkBmZmZmZuZIQAAAAAAAAE1AmpmZmZlZSUAAAAAAAABKQGZmZmZmZklAmpmZmZkZR0AzMzMzMzNFQDMzMzMzc0lAmpmZmZlZR0AAAAAAAIBIQJqZmZmZmUZAAAAAAACASEAAAAAAAEBHQAAAAAAAQElAzczMzMwMSUBmZmZmZqZJQJqZmZmZGUpAAAAAAACAR0CamZmZmZlGQGZmZmZmJklAZmZmZmZmR0AAAAAAAMBJQDMzMzMzM0dAMzMzMzPzRkCamZmZmZlFQJqZmZmZWUpAzczMzMwMSEAAAAAAAMBHQDMzMzMzM0dAmpmZmZkZSUAAAAAAAMBFQJqZmZmZ2UhAAAAAAAAASUCamZmZmRlLQAAAAAAAwEhAZmZmZmbmS0AAAAAAAMBGQDMzMzMzc0lAAAAAAACASUAzMzMzM7NJQAAAAAAAQElAzczMzMwMR0AAAAAAAABHQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"body_mass_g\":{\"__ndarray__\":\"AAAAAAAUrkAAAAAAACyqQAAAAAA
AAKlAAAAAAADGsUAAAAAAAIqrQAAAAAAAfq1AAAAAAACQqkAAAAAAADCxQAAAAAAABLBAAAAAAADcrkAAAAAAAKSvQAAAAAAAwLJAAAAAAACKq0AAAAAAACCsQAAAAAAAGKVAAAAAAACErEAAAAAAAISsQAAAAAAApK9AAAAAAACwrUAAAAAAAEytQAAAAAAAsK1AAAAAAAAsqkAAAAAAAH6tQAAAAAAAsK1AAAAAAACUsUAAAAAAALasQAAAAAAA6KxAAAAAAACkr0AAAAAAADawQAAAAAAAYrFAAAAAAACErEAAAAAAACCsQAAAAAAAvKtAAAAAAADIqUAAAAAAAMywQAAAAAAA3K5AAAAAAADcrkAAAAAAAFirQAAAAAAApK9AAAAAAACQqkAAAAAAAOisQAAAAAAA9KpAAAAAAADorEAAAAAAAGSpQAAAAAAAyKlAAAAAAACErEAAAAAAAGSpQAAAAAAA6KxAAAAAAADuq0AAAAAAAKimQAAAAAAAGq1AAAAAAAD6qUAAAAAAAHiuQAAAAAAA9KpAAAAAAAB+rUAAAAAAAJCqQAAAAAAAIKxAAAAAAAB4rkAAAAAAAMywQAAAAAAAsK1AAAAAAABAr0AAAAAAAFirQAAAAAAAtqxAAAAAAAAEsEAAAAAAANyuQAAAAAAAkKpAAAAAAABkqUAAAAAAADawQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"flipper_length_mm\":{\"__ndarray__\":\"AAAAAABgaEAAAAAAAGBnQAAAAAAAYGdAAAAAAACgaUAAAAAAAIBnQAAAAAAAIGhAAAAAAADgZ0AAAAAAAGBoQAAAAAAAQGpAAAAAAADAaUAAAAAAACBpQAAAAAAAQGpAAAAAAABAaEAAAAAAACBoQAAAAAAAAGhAAAAAAABgaEAAAAAAACBoQAAAAAAAYGlAAAAAAABAaUAAAAAAAKBoQAAAAAAAIGhAAAAAAABgZ0AAAAAAAEBoQAAAAAAAIGhAAAAAAACgaUAAAAAAAMBoQAAAAAAAoGZAAAAAAABgaUAAAAAAAKBoQAAAAAAAIGlAAAAAAABgZ0AAAAAAAKBmQAAAAAAAgGhAAAAAAABgaEAAAAAAAIBqQAAAAAAAwGhAAAAAAABAakAAAAAAAABoQAAAAAAAIGlAAAAAAADAZ0AAAAAAAMBoQAAAAAAAoGhAAAAAAAAgZ0AAAAAAAOBnQAAAAAAAoGhAAAAAAACgZ0AAAAAAAGBnQAAAAAAA4GdAAAAAAADAZ0AAAAAAAGBnQAAAAAAAoGhAAAAAAADgaEAAAAAAAOBoQAAAAAAAwGdAAAAAAADAaEAAAAAAAEBpQAAAAAAAYGhAAAAAAACAaEAAAAAAACBpQAAAAAAAAGlAAAAAAADgaUAAAAAAAIBoQAAAAAAAgGhAAAAAAABgaUAAAAAAACBpQAAAAAAAAGlAAAAAAABAZkAAAAAAAGBoQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"index\":[68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135],\"penguin 
number\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67],\"species\":[\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\"]},\"selected\":{\"id\":\"1231\"},\"selection_policy\":{\"id\":\"1230\"}},\"id\":\"1187\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"label\":{\"value\":\"Chinstrap\"},\"renderers\":[{\"id\":\"1192\"}]},\"id\":\"1203\",\"type\":\"LegendItem\"},{\"attributes\":{\"fill_color\":{\"value\":\"#2ca02c\"},\"hatch_color\":{\"value\":\"#2ca02c\"},\"line_color\":{\"value\":\"#2ca02c\"},\"x\":{\"field\":\"bill_length_mm\"},\"y\":{\"field\":\"flipper_length_mm\"}},\"id\":\"1196\",\"type\":\"Circle\"}],\"root_ids\":[\"1149\"]},\"title\":\"Bokeh Application\",\"version\":\"2.4.3\"}};\n", + " const render_items = [{\"docid\":\"2647b03c-76bb-4c14-ab26-4a64702e9ccd\",\"root_ids\":[\"1149\"],\"roots\":{\"1149\":\"a0534c4a-e424-44e8-b316-e14b0900cab6\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", 
+ " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " let attempts = 0;\n", + " const timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1149" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# Create figure\n", + "p = bokeh.plotting.figure(\n", + " frame_width=300,\n", + " frame_height=300,\n", + " x_axis_label=\"bill length (mm)\",\n", + " y_axis_label=\"flipper length (mm)\",\n", + " toolbar_location=\"above\",\n", + ")\n", + "\n", + "# Build legend as we populate glyphs\n", + "legend_items = []\n", + "for color, (species, g) in zip(bokeh.palettes.Category10_3, df.groupby(\"species\")):\n", + " glyph = p.circle(source=g, x=\"bill_length_mm\", y=\"flipper_length_mm\", color=color)\n", + " legend_items.append((species, [glyph]))\n", + "\n", + "# Place legend\n", + "legend = bokeh.models.Legend(items=legend_items, location=\"center\")\n", + "p.add_layout(legend, \"right\")\n", + "p.legend.click_policy = \"hide\"\n", + "\n", + "bokeh.io.show(p)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## An important note about tidiness\n", + "\n", + "It is important to note that there is more than one way to make a data set tidy. In the example of the Palmer penguin data set, we saw two legitimate ways of making the data frame tidy. In our preferred version, each row corresponded to a measurement of a single _penguin_, which had several variables associated with it. 
In another version, each row corresponded to a single _feature_ of a penguin.\n", + "\n", + "To demonstrate that this latter version is workable, but more cumbersome, we can make the same plots as above. First, we'll melt it again." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
penguin numberspeciesvariablevalue
00Adeliebill_depth_mm18.5
11Adeliebill_depth_mm16.9
22Adeliebill_depth_mm19.5
33Adeliebill_depth_mm18.3
44Adeliebill_depth_mm18.0
\n", + "
" + ], + "text/plain": [ + " penguin number species variable value\n", + "0 0 Adelie bill_depth_mm 18.5\n", + "1 1 Adelie bill_depth_mm 16.9\n", + "2 2 Adelie bill_depth_mm 19.5\n", + "3 3 Adelie bill_depth_mm 18.3\n", + "4 4 Adelie bill_depth_mm 18.0" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.melt(id_vars=[\"penguin number\", \"species\"])\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plotting the ECDFs is not really a problem with this form of the data frame. We just need to use Boolean indexing to pull out the bill length rows." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " const docs_json = {\"31a89ea4-6146-459d-91db-13d391278761\":{\"defs\":[],\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1319\"}],\"center\":[{\"id\":\"1322\"},{\"id\":\"1326\"}],\"frame_height\":275,\"frame_width\":400,\"left\":[{\"id\":\"1323\"}],\"renderers\":[{\"id\":\"1346\"},{\"id\":\"1353\"},{\"id\":\"1360\"}],\"right\":[{\"id\":\"1362\"}],\"title\":{\"id\":\"1396\"},\"toolbar\":{\"id\":\"1334\"},\"toolbar_location\":\"above\",\"x_range\":{\"id\":\"1311\"},\"x_scale\":{\"id\":\"1315\"},\"y_range\":{\"id\":\"1313\"},\"y_scale\":{\"id\":\"1317\"}},\"id\":\"1310\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"value\":\"#ff7e0e\"},\"hatch_alpha\":{\"value\":0.2},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"value\":\"#ff7e0e\"},\"x\":{\"field\":\"value\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1352\",\"type\":\"Circle\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b3\"},\"line_color\":{\"value\":\"#1f77b3\"},\"x\":{\"field\":\"value\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1343\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1320\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"1319\"},\"coordinates\":null,\"group\":null,\"ticker\":null},\"id\":\"1322\",\"type\":\"Grid\"},{\"attributes\":{\"coordinates\":null,\"data_source\":{\"id\":\"1341\"},\"glyph\":{\"id\":\"1343\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1345\"},\"nonselection_glyph\":{\"id\":\"1344\"},\"view\":{\"id\":\"1347\"}},\"id\":\"1346\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom_units\":\"screen\",\"coordinates\":null,\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"group\":null,\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\
":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"syncable\":false,\"top_units\":\"screen\"},\"id\":\"1333\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"click_policy\":\"hide\",\"coordinates\":null,\"group\":null,\"items\":[{\"id\":\"1363\"},{\"id\":\"1364\"},{\"id\":\"1365\"}],\"location\":\"center\"},\"id\":\"1362\",\"type\":\"Legend\"},{\"attributes\":{\"fill_color\":{\"value\":\"#2ba02b\"},\"line_color\":{\"value\":\"#2ba02b\"},\"x\":{\"field\":\"value\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1357\",\"type\":\"Circle\"},{\"attributes\":{\"coordinates\":null,\"group\":null},\"id\":\"1396\",\"type\":\"Title\"},{\"attributes\":{\"label\":{\"value\":\"Adelie\"},\"renderers\":[{\"id\":\"1346\"}]},\"id\":\"1363\",\"type\":\"LegendItem\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"value\":\"#1f77b3\"},\"hatch_alpha\":{\"value\":0.2},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"value\":\"#1f77b3\"},\"x\":{\"field\":\"value\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1345\",\"type\":\"Circle\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"value\":\"#2ba02b\"},\"hatch_alpha\":{\"value\":0.2},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"value\":\"#2ba02b\"},\"x\":{\"field\":\"value\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1359\",\"type\":\"Circle\"},{\"attributes\":{\"data\":{\"__ECDF\":{\"__ndarray__\":\"pqWlpaWl1T+XlpaWlpbWPzw8PDw8POw/LS0tLS0t7T8eHh4eHh6+Px4eHh4eHuY/8fDw8PDw6D/Ew8PDw8PTP5eWlpaWlqY/iIeHh4eH5z/x8PDw8PDAP9PS0tLS0rI/eHh4eHh46D9paWlpaWnZPzw8PDw8PNw/4uHh4eHh6T+1tLS0tLTsP3h4eHh4eMg/LS0tLS0t5T9aWlpaWlrKP+Lh4eHh4eE/AAAAAAAA4D94eHh4eHjgP2lpaWlpaeE/xMPDw8PD6z8PDw8PDw/fP7W0tLS0tOQ/WlpaWlpa4j9LS0tLS0vjP9PS0tLS0sI/Hh4eHh4ezj8AAAAAAADwP1paWlpaWuo/Hh4eHh4e3j/T0tLS0tLqPx4eHh4eHu4/WlpaWlpa2j8PDw8PDw/nP7W0tLS0tNQ/Hh4eHh4ejj+XlpaWlpa2P4iHh4eHh9c/4uHh4eHh0T9paWlpaWnpP5eWlpaWlu4/Dw8PDw8P7z8AAAAAAADQP/Hw8PDw8OA/Hh4eHh4enj+mpaWlpaXlP7W0tLS0tMQ/l5aWlpaWxj9LS0tLS0vbP9PS0tLS0tI/LS0tLS0t3T94eHh4eHjYPzw8PDw8PMw/xMPDw8PD4z88P
Dw8PDzkPx4eHh4eHq4/8fDw8PDw0D/T0tLS0tLiP6alpaWlpe0/S0tLS0tL6z+XlpaWlpbmPwAAAAAAAOg/iIeHh4eH7z9aWlpaWlq6Pw==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"__label\":[\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\"],\"index\":[204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271],\"penguin 
number\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67],\"species\":[\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\",\"Adelie\"],\"value\":{\"__ndarray__\":\"ZmZmZmZmQkAAAAAAAIBCQAAAAAAAAEVAmpmZmZlZRUCamZmZmdlBQGZmZmZm5kNAZmZmZmZmREDNzMzMzExCQM3MzMzMDEFAZmZmZmYmRECamZmZmdlBQM3MzMzMTEFAzczMzMxMREBmZmZmZqZCQAAAAAAAwEJAzczMzMyMREAAAAAAAABFQJqZmZmZGUJAzczMzMzMQ0CamZmZmRlCQM3MzMzMDENAZmZmZmbmQkBmZmZmZuZCQDMzMzMz80JAAAAAAADARECamZmZmdlCQAAAAAAAwENAzczMzMwMQ0AzMzMzM3NDQDMzMzMz80FAZmZmZmYmQkAAAAAAAABHQM3MzMzMjERAzczMzMzMQkDNzMzMzIxEQJqZmZmZmUVAZmZmZmamQkCamZmZmRlEQJqZmZmZWUJAAAAAAADAQEAAAAAAAIBBQAAAAAAAgEJAAAAAAABAQkAzMzMzM3NEQJqZmZmZmUVAzczMzMwMRkAzMzMzMzNCQGZmZmZm5kJAAAAAAAAAQUDNzMzMzMxDQAAAAAAAAEJAAAAAAAAAQkBmZmZmZqZCQAAAAAAAQEJAAAAAAADAQkCamZmZmZlCQJqZmZmZGUJAMzMzMzNzQ0AAAAAAAIBDQAAAAAAAQEFAMzMzMzMzQkDNzMzMzAxDQGZmZmZmZkVAzczMzMyMREDNzMzMzAxEQAAAAAAAQERAZmZmZmbmRkCamZmZmZlBQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"variable\":[\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_len
gth_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\"]},\"selected\":{\"id\":\"1405\"},\"selection_policy\":{\"id\":\"1404\"}},\"id\":\"1341\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b3\"},\"hatch_alpha\":{\"value\":0.1},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b3\"},\"x\":{\"field\":\"value\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1344\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1404\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"1355\"}},\"id\":\"1361\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1324\",\"type\":\"BasicTicker\"},{\"attributes\":{\"source\":{\"id\":\"1341\"}},\"id\":\"1347\",\"type\":\"CDSView\"},{\"attributes\":{\"label\":{\"value\":\"Gentoo\"},\"renderers\":[{\"id\":\"1360\"}]},\"id\":\"1365\",\"type\":\"LegendItem\"},{\"attributes\":{\"data\":{\"__ECDF\":{\"__ndarray__\":\"S0tLS0tL4z9aWlpaWlraP2lpaW
lpaeE/tbS0tLS05D8tLS0tLS3lP/Hw8PDw8Og/AAAAAAAA6D+Ih4eHh4fnP+Lh4eHh4dE/Hh4eHh4e5j9LS0tLS0vbPzw8PDw8POQ/PDw8PDw83D88PDw8PDzsP7W0tLS0tNQ/aWlpaWlp6T+XlpaWlpamP1paWlpaWso/eHh4eHh46D88PDw8PDzMP+Lh4eHh4ek/Hh4eHh4ejj8PDw8PDw/vP9PS0tLS0uo/LS0tLS0t7T+XlpaWlpbmP9PS0tLS0rI/xMPDw8PD6z9aWlpaWlrqP5eWlpaWlu4/eHh4eHh42D8AAAAAAADgP4iHh4eHh+8/l5aWlpaW1j/T0tLS0tLSPw8PDw8PD+c/pqWlpaWl7T8tLS0tLS3dP7W0tLS0tMQ/l5aWlpaWxj/Ew8PDw8PTP9PS0tLS0sI/8fDw8PDw4D8eHh4eHh7OP+Lh4eHh4eE/8fDw8PDw0D+mpaWlpaXlPw8PDw8PD98/WlpaWlpauj+XlpaWlpa2Px4eHh4eHr4/AAAAAAAA0D9aWlpaWlriP2lpaWlpadk/8fDw8PDwwD+Ih4eHh4fXPx4eHh4eHq4/Hh4eHh4enj+1tLS0tLTsPx4eHh4eHu4/Hh4eHh4e3j9LS0tLS0vrP9PS0tLS0uI/xMPDw8PD4z94eHh4eHjgPwAAAAAAAPA/eHh4eHh4yD+mpaWlpaXVPw==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"__label\":[\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\"],\"index\":[340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407],\"penguin 
number\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67],\"species\":[\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\",\"Gentoo\"],\"value\":{\"__ndarray__\":\"MzMzMzMzSEBmZmZmZiZHQAAAAAAAwEdAmpmZmZlZSECamZmZmVlIQM3MzMzMzEhAZmZmZmamSECamZmZmZlIQAAAAAAAwEZAZmZmZmZmSEAzMzMzMzNHQM3MzMzMTEhAAAAAAABAR0BmZmZmZmZJQJqZmZmZ2UZAAAAAAAAASUCamZmZmVlFQDMzMzMzc0ZAAAAAAADASECamZmZmZlGQAAAAAAAAElAMzMzMzNzREAzMzMzM7NKQM3MzMzMDElAzczMzMyMSUAAAAAAAIBIQDMzMzMzc0VAAAAAAABASUAAAAAAAABJQAAAAAAAQEpAmpmZmZkZR0BmZmZmZmZHQM3MzMzMjEtAzczMzMwMR0AAAAAAAMBGQM3MzMzMjEhAzczMzMwMSkAAAAAAAEBHQAAAAAAAQEZAAAAAAABARkAAAAAAAMBGQDMzMzMzM0ZAmpmZmZmZR0CamZmZmZlGQM3MzMzMDEhAZmZmZmamRkCamZmZmVlIQM3MzMzMTEdAZmZmZmamRUCamZmZmZlFQAAAAAAAwEVAmpmZmZmZRkCamZmZmRlIQJqZmZmZGUdAAAAAAAAARkDNzMzMzAxHQGZmZmZmZkVAzczMzMxMRUBmZmZmZmZJQJqZmZmZGUpAAAAAAABAR0CamZmZmRlJQJqZmZmZGUhAMzMzMzMzSEBmZmZmZmZHQDMzMzMz80tAAAAAAABARkBmZmZmZuZGQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"variable\":[\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_len
gth_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\"]},\"selected\":{\"id\":\"1409\"},\"selection_policy\":{\"id\":\"1408\"}},\"id\":\"1355\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis_label\":\"ECDF\",\"coordinates\":null,\"formatter\":{\"id\":\"1399\"},\"group\":null,\"major_label_policy\":{\"id\":\"1400\"},\"ticker\":{\"id\":\"1324\"}},\"id\":\"1323\",\"type\":\"LinearAxis\"},{\"attributes\":{\"data\":{\"__ECDF\":{\"__ndarray__\":\"aWlpaWlp2T+XlpaWlpamPx4eHh4eHo4/Hh4eHh4e7j/x8PDw8PDAP3h4eHh4eOA/S0tLS0tL2z8eHh4eHh7ePx4eHh4eHuY/S0tLS0tL6z/Ew8PDw8PrPzw8PDw8POw/tbS0tLS0xD+XlpaWlpbGP5eWlpaWltY/eHh4eHh4yD94eHh4eHjoPw8PDw8PD98/09LS0tLS4j/x8PDw8PDoPy0tLS0tLeU/Hh4eHh4erj/T0tLS0tLqP8TDw8PDw9M/l5aWlpaW7j9paWlpaWnhPwAAAAAAAPA/pqWlpaWl5T+1tLS0tLTsP5eWlpaWluY/AAAAAAAA0D8eHh4eHh6ePw8PDw8PD+c/tbS0tLS01D88PDw8PDzcP1paWlpaWro/LS0tLS0t3T/T0tLS0tLSPzw8PDw8POQ/WlpaWlpa4j9paWlpaWnpPy0tLS0tLe0/iIeHh4eH1z8eHh4eHh6+P8TDw8PDw+M/pqWlpaWl1T9aWlpaWlrqP/Hw8PDw8N
A/WlpaWlpayj/T0tLS0tKyP6alpaWlpe0/WlpaWlpa2j94eHh4eHjYP+Lh4eHh4dE/S0tLS0tL4z+XlpaWlpa2P/Hw8PDw8OA/4uHh4eHh4T8PDw8PDw/vPwAAAAAAAOA/iIeHh4eH7z/T0tLS0tLCP4iHh4eHh+c/AAAAAAAA6D/i4eHh4eHpP7W0tLS0tOQ/Hh4eHh4ezj88PDw8PDzMPw==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"__label\":[\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\"],\"index\":[272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339],\"penguin 
number\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67],\"species\":[\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\",\"Chinstrap\"],\"value\":{\"__ndarray__\":\"zczMzMzMR0AAAAAAAEBFQDMzMzMzc0RAZmZmZmZmSkAzMzMzM7NGQM3MzMzMzEhAAAAAAABASECamZmZmZlIQGZmZmZmZklAMzMzMzPzSUAAAAAAAABKQAAAAAAAAEpAzczMzMzMRkCamZmZmdlGQDMzMzMzc0dAmpmZmZnZRkBmZmZmZqZJQGZmZmZmpkhAmpmZmZkZSUBmZmZmZqZJQM3MzMzMTElAAAAAAABARUCamZmZmdlJQM3MzMzMTEdAAAAAAADASkBmZmZmZuZIQAAAAAAAAE1AmpmZmZlZSUAAAAAAAABKQGZmZmZmZklAmpmZmZkZR0AzMzMzMzNFQDMzMzMzc0lAmpmZmZlZR0AAAAAAAIBIQJqZmZmZmUZAAAAAAACASEAAAAAAAEBHQAAAAAAAQElAzczMzMwMSUBmZmZmZqZJQJqZmZmZGUpAAAAAAACAR0CamZmZmZlGQGZmZmZmJklAZmZmZmZmR0AAAAAAAMBJQDMzMzMzM0dAMzMzMzPzRkCamZmZmZlFQJqZmZmZWUpAzczMzMwMSEAAAAAAAMBHQDMzMzMzM0dAmpmZmZkZSUAAAAAAAMBFQJqZmZmZ2UhAAAAAAAAASUCamZmZmRlLQAAAAAAAwEhAZmZmZmbmS0AAAAAAAMBGQDMzMzMzc0lAAAAAAACASUAzMzMzM7NJQAAAAAAAQElAzczMzMwMR0AAAAAAAABHQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"variable
\":[\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\",\"bill_length_mm\"]},\"selected\":{\"id\":\"1407\"},\"selection_policy\":{\"id\":\"1406\"}},\"id\":\"1348\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1408\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1405\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1409\",\"type\":\"Selection\"},{\"attributes\":{\"coordinates\":null,\"data_source\":{\"id\":\"1355\"},\"glyph\":{\"id\":\"1357\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1359\"},\"nonselection_glyph\":{\"id\":\"1358\"},\"view\":{\"id\":\"1361\"}},\"id\":\"1360\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1332\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1327\",\"type\":\"PanTool\"},{\"a
ttributes\":{\"axis\":{\"id\":\"1323\"},\"coordinates\":null,\"dimension\":1,\"group\":null,\"ticker\":null},\"id\":\"1326\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1330\",\"type\":\"SaveTool\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#2ba02b\"},\"hatch_alpha\":{\"value\":0.1},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#2ba02b\"},\"x\":{\"field\":\"value\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1358\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1331\",\"type\":\"ResetTool\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#ff7e0e\"},\"hatch_alpha\":{\"value\":0.1},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#ff7e0e\"},\"x\":{\"field\":\"value\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":\"1351\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1311\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1328\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1400\",\"type\":\"AllLabels\"},{\"attributes\":{},\"id\":\"1315\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1317\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1403\",\"type\":\"AllLabels\"},{\"attributes\":{},\"id\":\"1313\",\"type\":\"DataRange1d\"},{\"attributes\":{\"label\":{\"value\":\"Chinstrap\"},\"renderers\":[{\"id\":\"1353\"}]},\"id\":\"1364\",\"type\":\"LegendItem\"},{\"attributes\":{},\"id\":\"1406\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"tools\":[{\"id\":\"1327\"},{\"id\":\"1328\"},{\"id\":\"1329\"},{\"id\":\"1330\"},{\"id\":\"1331\"},{\"id\":\"1332\"}]},\"id\":\"1334\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1407\",\"type\":\"Selection\"},{\"attributes\":{\"overlay\":{\"id\":\"1333\"}},\"id\":\"1329\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1399\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"fill_color\":{\"value\":\"#ff7e0e\"},\"line_color\":{\"value\":\"#ff7e0e\"},\"x\":{\"field\":\"value\"},\"y\":{\"field\":\"__ECDF\"}},\"id\":
\"1350\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1402\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"coordinates\":null,\"data_source\":{\"id\":\"1348\"},\"glyph\":{\"id\":\"1350\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1352\"},\"nonselection_glyph\":{\"id\":\"1351\"},\"view\":{\"id\":\"1354\"}},\"id\":\"1353\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis_label\":\"bill length (mm)\",\"coordinates\":null,\"formatter\":{\"id\":\"1402\"},\"group\":null,\"major_label_policy\":{\"id\":\"1403\"},\"ticker\":{\"id\":\"1320\"}},\"id\":\"1319\",\"type\":\"LinearAxis\"},{\"attributes\":{\"source\":{\"id\":\"1348\"}},\"id\":\"1354\",\"type\":\"CDSView\"}],\"root_ids\":[\"1310\"]},\"title\":\"Bokeh Application\",\"version\":\"2.4.3\"}};\n", + " const render_items = [{\"docid\":\"31a89ea4-6146-459d-91db-13d391278761\",\"root_ids\":[\"1310\"],\"roots\":{\"1310\":\"827bdc96-00cb-4f6f-9d19-cbebc61665ee\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " let attempts = 0;\n", + " const timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1310" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "bokeh.io.show(\n", + " iqplot.ecdf(\n", + " data=df.loc[df[\"variable\"] == \"bill_length_mm\", :],\n", + " q=\"value\",\n", + " cats=\"species\",\n", + " frame_width=400,\n", + " x_axis_label=\"bill length (mm)\",\n", + " )\n", + 
")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Making the scatter plot, however, is much more difficult and involves a lot of Boolean indexing by hand." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " const docs_json = {\"3c278beb-0c4b-4a08-a148-5229d0a681f7\":{\"defs\":[],\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1495\"}],\"center\":[{\"id\":\"1498\"},{\"id\":\"1502\"}],\"frame_height\":300,\"frame_width\":300,\"left\":[{\"id\":\"1499\"}],\"renderers\":[{\"id\":\"1521\"},{\"id\":\"1527\"},{\"id\":\"1533\"}],\"right\":[{\"id\":\"1535\"}],\"title\":{\"id\":\"1584\"},\"toolbar\":{\"id\":\"1510\"},\"toolbar_location\":\"above\",\"x_range\":{\"id\":\"1487\"},\"x_scale\":{\"id\":\"1491\"},\"y_range\":{\"id\":\"1489\"},\"y_scale\":{\"id\":\"1493\"}},\"id\":\"1486\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"source\":{\"id\":\"1517\"}},\"id\":\"1522\",\"type\":\"CDSView\"},{\"attributes\":{\"data\":{\"x\":{\"__ndarray__\":\"ZmZmZmZmQkAAAAAAAIBCQAAAAAAAAEVAmpmZmZlZRUCamZmZmdlBQGZmZmZm5kNAZmZmZmZmREDNzMzMzExCQM3MzMzMDEFAZmZmZmYmRECamZmZmdlBQM3MzMzMTEFAzczMzMxMREBmZmZmZqZCQAAAAAAAwEJAzczMzMyMREAAAAAAAABFQJqZmZmZGUJAzczMzMzMQ0CamZmZmRlCQM3MzMzMDENAZmZmZmbmQkBmZmZmZuZCQDMzMzMz80JAAAAAAADARECamZmZmdlCQAAAAAAAwENAzczMzMwMQ0AzMzMzM3NDQDMzMzMz80FAZmZmZmYmQkAAAAAAAABHQM3MzMzMjERAzczMzMzMQkDNzMzMzIxEQJqZmZmZmUVAZmZmZmamQkCamZmZmRlEQJqZmZmZWUJAAAAAAADAQEAAAAAAAIBBQAAAAAAAgEJAAAAAAABAQkAzMzMzM3NEQJqZmZmZmUVAzczMzMwMRkAzMzMzMzNCQGZmZmZm5kJAAAAAAAAAQUDNzMzMzMxDQAAAAAAAAEJAAAAAAAAAQkBmZmZmZqZCQAAAAAAAQEJAAAAAAADAQkCamZmZmZlCQJqZmZmZGUJAMzMzMzNzQ0AAAAAAAIBDQAAAAAAAQEFAMzMzMzMzQkDNzMzMzAxDQGZmZmZmZkVAzczMzMyMREDNzMzMzAxEQAAAAAAAQERAZmZmZmbmRkCamZmZmZlBQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"y\":{\"__ndarray__\":\"AAAAAAAgaEAAAAAAACBnQAAAAAAAAGlAAAAAAACAaEAAAAAAAEBpQAAAAAAAAGdAAAAAAABgaEAAAAAAAABnQAAAAAAAIGhAAAAAAABgaEAAAAAAACBnQAAAAAAAoGdAAAAAAADgaEAAAAAAAOBnQAAAAAAA4GhAAAAAAACAZ0AAAAAAAMBnQAAAAAAAYGdAAAAAAADAZ0AAAAAAAGBnQAAAAAAAoGZAAAAAAAA
gaEAAAAAAAEBnQAAAAAAAgGVAAAAAAABgaEAAAAAAAMBoQAAAAAAAgGdAAAAAAADAaEAAAAAAAKBmQAAAAAAAwGdAAAAAAADAZ0AAAAAAAEBoQAAAAAAAwGZAAAAAAABAaEAAAAAAAABoQAAAAAAAoGhAAAAAAAAAaEAAAAAAAABmQAAAAAAAIGhAAAAAAADAZ0AAAAAAAABoQAAAAAAAIGdAAAAAAACgZkAAAAAAAOBnQAAAAAAAAGhAAAAAAABAakAAAAAAAGBoQAAAAAAAgGZAAAAAAAAgZ0AAAAAAAOBnQAAAAAAAYGdAAAAAAADAZ0AAAAAAAOBoQAAAAAAAwGZAAAAAAABgZkAAAAAAAEBmQAAAAAAAYGdAAAAAAADAZ0AAAAAAACBnQAAAAAAAYGdAAAAAAAAAZ0AAAAAAAGBnQAAAAAAAYGhAAAAAAACgZ0AAAAAAAIBnQAAAAAAAYGdAAAAAAACgaEAAAAAAAEBnQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]}},\"selected\":{\"id\":\"1593\"},\"selection_policy\":{\"id\":\"1592\"}},\"id\":\"1517\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"click_policy\":\"hide\",\"coordinates\":null,\"group\":null,\"items\":[{\"id\":\"1536\"},{\"id\":\"1537\"},{\"id\":\"1538\"}],\"location\":\"center\"},\"id\":\"1535\",\"type\":\"Legend\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#ff7f0e\"},\"hatch_alpha\":{\"value\":0.1},\"hatch_color\":{\"value\":\"#ff7f0e\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#ff7f0e\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1525\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1496\",\"type\":\"BasicTicker\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"hatch_color\":{\"value\":\"#1f77b4\"},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1518\",\"type\":\"Circle\"},{\"attributes\":{\"axis\":{\"id\":\"1495\"},\"coordinates\":null,\"group\":null,\"ticker\":null},\"id\":\"1498\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1592\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"bottom_units\":\"screen\",\"coordinates\":null,\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"group\":null,\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"syncable\":false,\"top_units\":\"sc
reen\"},\"id\":\"1509\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"1504\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1596\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"coordinates\":null,\"data_source\":{\"id\":\"1523\"},\"glyph\":{\"id\":\"1524\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1526\"},\"nonselection_glyph\":{\"id\":\"1525\"},\"view\":{\"id\":\"1528\"}},\"id\":\"1527\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis_label\":\"flipper length (mm)\",\"coordinates\":null,\"formatter\":{\"id\":\"1587\"},\"group\":null,\"major_label_policy\":{\"id\":\"1588\"},\"ticker\":{\"id\":\"1500\"}},\"id\":\"1499\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1593\",\"type\":\"Selection\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"value\":\"#1f77b4\"},\"hatch_alpha\":{\"value\":0.2},\"hatch_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1520\",\"type\":\"Circle\"},{\"attributes\":{\"axis\":{\"id\":\"1499\"},\"coordinates\":null,\"dimension\":1,\"group\":null,\"ticker\":null},\"id\":\"1502\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1597\",\"type\":\"Selection\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"hatch_alpha\":{\"value\":0.1},\"hatch_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1519\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1500\",\"type\":\"BasicTicker\"},{\"attributes\":{\"label\":{\"value\":\"Adelie\"},\"renderers\":[{\"id\":\"1521\"}]},\"id\":\"1536\",\"type\":\"LegendItem\"},{\"attributes\":{\"coordinates\":null,\"data_source\":{\"id\":\"1517\"},\"glyph\":{\"id\":\"1518\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1520\"},\"nonselection_glyph\":{\"id\":\"1519\"},\"view\"
:{\"id\":\"1522\"}},\"id\":\"1521\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1508\",\"type\":\"HelpTool\"},{\"attributes\":{\"label\":{\"value\":\"Gentoo\"},\"renderers\":[{\"id\":\"1533\"}]},\"id\":\"1538\",\"type\":\"LegendItem\"},{\"attributes\":{},\"id\":\"1503\",\"type\":\"PanTool\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"value\":\"#2ca02c\"},\"hatch_alpha\":{\"value\":0.2},\"hatch_color\":{\"value\":\"#2ca02c\"},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"value\":\"#2ca02c\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1532\",\"type\":\"Circle\"},{\"attributes\":{\"overlay\":{\"id\":\"1509\"}},\"id\":\"1505\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#2ca02c\"},\"hatch_alpha\":{\"value\":0.1},\"hatch_color\":{\"value\":\"#2ca02c\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#2ca02c\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1531\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1506\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1507\",\"type\":\"ResetTool\"},{\"attributes\":{\"coordinates\":null,\"data_source\":{\"id\":\"1529\"},\"glyph\":{\"id\":\"1530\"},\"group\":null,\"hover_glyph\":null,\"muted_glyph\":{\"id\":\"1532\"},\"nonselection_glyph\":{\"id\":\"1531\"},\"view\":{\"id\":\"1534\"}},\"id\":\"1533\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"tools\":[{\"id\":\"1503\"},{\"id\":\"1504\"},{\"id\":\"1505\"},{\"id\":\"1506\"},{\"id\":\"1507\"},{\"id\":\"1508\"}]},\"id\":\"1510\",\"type\":\"Toolbar\"},{\"attributes\":{\"label\":{\"value\":\"Chinstrap\"},\"renderers\":[{\"id\":\"1527\"}]},\"id\":\"1537\",\"type\":\"LegendItem\"},{\"attributes\":{},\"id\":\"1587\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"data\":{\"x\":{\"__ndarray__\":\"zczMzMzMR0AAAAAAAEBFQDMzMzMzc0RAZmZmZmZmSkAzMzMzM7NGQM3MzMzMzEhAAAAAAABASECamZmZmZlIQGZmZmZmZklAMzMzMzPzSUAAAAAAAABKQAAAAAAAAEpAzczMzMzMRkCamZmZmdlGQD
MzMzMzc0dAmpmZmZnZRkBmZmZmZqZJQGZmZmZmpkhAmpmZmZkZSUBmZmZmZqZJQM3MzMzMTElAAAAAAABARUCamZmZmdlJQM3MzMzMTEdAAAAAAADASkBmZmZmZuZIQAAAAAAAAE1AmpmZmZlZSUAAAAAAAABKQGZmZmZmZklAmpmZmZkZR0AzMzMzMzNFQDMzMzMzc0lAmpmZmZlZR0AAAAAAAIBIQJqZmZmZmUZAAAAAAACASEAAAAAAAEBHQAAAAAAAQElAzczMzMwMSUBmZmZmZqZJQJqZmZmZGUpAAAAAAACAR0CamZmZmZlGQGZmZmZmJklAZmZmZmZmR0AAAAAAAMBJQDMzMzMzM0dAMzMzMzPzRkCamZmZmZlFQJqZmZmZWUpAzczMzMwMSEAAAAAAAMBHQDMzMzMzM0dAmpmZmZkZSUAAAAAAAMBFQJqZmZmZ2UhAAAAAAAAASUCamZmZmRlLQAAAAAAAwEhAZmZmZmbmS0AAAAAAAMBGQDMzMzMzc0lAAAAAAACASUAzMzMzM7NJQAAAAAAAQElAzczMzMwMR0AAAAAAAABHQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"y\":{\"__ndarray__\":\"AAAAAABgaEAAAAAAAGBnQAAAAAAAYGdAAAAAAACgaUAAAAAAAIBnQAAAAAAAIGhAAAAAAADgZ0AAAAAAAGBoQAAAAAAAQGpAAAAAAADAaUAAAAAAACBpQAAAAAAAQGpAAAAAAABAaEAAAAAAACBoQAAAAAAAAGhAAAAAAABgaEAAAAAAACBoQAAAAAAAYGlAAAAAAABAaUAAAAAAAKBoQAAAAAAAIGhAAAAAAABgZ0AAAAAAAEBoQAAAAAAAIGhAAAAAAACgaUAAAAAAAMBoQAAAAAAAoGZAAAAAAABgaUAAAAAAAKBoQAAAAAAAIGlAAAAAAABgZ0AAAAAAAKBmQAAAAAAAgGhAAAAAAABgaEAAAAAAAIBqQAAAAAAAwGhAAAAAAABAakAAAAAAAABoQAAAAAAAIGlAAAAAAADAZ0AAAAAAAMBoQAAAAAAAoGhAAAAAAAAgZ0AAAAAAAOBnQAAAAAAAoGhAAAAAAACgZ0AAAAAAAGBnQAAAAAAA4GdAAAAAAADAZ0AAAAAAAGBnQAAAAAAAoGhAAAAAAADgaEAAAAAAAOBoQAAAAAAAwGdAAAAAAADAaEAAAAAAAEBpQAAAAAAAYGhAAAAAAACAaEAAAAAAACBpQAAAAAAAAGlAAAAAAADgaUAAAAAAAIBoQAAAAAAAgGhAAAAAAABgaUAAAAAAACBpQAAAAAAAAGlAAAAAAABAZkAAAAAAAGBoQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]}},\"selected\":{\"id\":\"1595\"},\"selection_policy\":{\"id\":\"1594\"}},\"id\":\"1523\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"1529\"}},\"id\":\"1534\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1594\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1588\",\"type\":\"AllLabels\"},{\"attributes\":{\"fill_color\":{\"value\":\"#ff7f0e\"},\"hatch_color\":{\"value\":\"#ff7f0e\"},\"line_color\":{\"value\":\"#ff7f0e\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1524\",\"type\":\"Circle\
"},{\"attributes\":{},\"id\":\"1595\",\"type\":\"Selection\"},{\"attributes\":{\"data\":{\"x\":{\"__ndarray__\":\"MzMzMzMzSEBmZmZmZiZHQAAAAAAAwEdAmpmZmZlZSECamZmZmVlIQM3MzMzMzEhAZmZmZmamSECamZmZmZlIQAAAAAAAwEZAZmZmZmZmSEAzMzMzMzNHQM3MzMzMTEhAAAAAAABAR0BmZmZmZmZJQJqZmZmZ2UZAAAAAAAAASUCamZmZmVlFQDMzMzMzc0ZAAAAAAADASECamZmZmZlGQAAAAAAAAElAMzMzMzNzREAzMzMzM7NKQM3MzMzMDElAzczMzMyMSUAAAAAAAIBIQDMzMzMzc0VAAAAAAABASUAAAAAAAABJQAAAAAAAQEpAmpmZmZkZR0BmZmZmZmZHQM3MzMzMjEtAzczMzMwMR0AAAAAAAMBGQM3MzMzMjEhAzczMzMwMSkAAAAAAAEBHQAAAAAAAQEZAAAAAAABARkAAAAAAAMBGQDMzMzMzM0ZAmpmZmZmZR0CamZmZmZlGQM3MzMzMDEhAZmZmZmamRkCamZmZmVlIQM3MzMzMTEdAZmZmZmamRUCamZmZmZlFQAAAAAAAwEVAmpmZmZmZRkCamZmZmRlIQJqZmZmZGUdAAAAAAAAARkDNzMzMzAxHQGZmZmZmZkVAzczMzMxMRUBmZmZmZmZJQJqZmZmZGUpAAAAAAABAR0CamZmZmRlJQJqZmZmZGUhAMzMzMzMzSEBmZmZmZmZHQDMzMzMz80tAAAAAAABARkBmZmZmZuZGQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]},\"y\":{\"__ndarray__\":\"AAAAAACAa0AAAAAAAOBqQAAAAAAAIGpAAAAAAAAAakAAAAAAAEBqQAAAAAAAAGtAAAAAAAAga0AAAAAAAKBrQAAAAAAAgGtAAAAAAADAa0AAAAAAAABrQAAAAAAAwGxAAAAAAABAakAAAAAAAEBsQAAAAAAAwGpAAAAAAAAAbEAAAAAAAABqQAAAAAAAoGpAAAAAAACgbEAAAAAAAOBqQAAAAAAAgGtAAAAAAADAakAAAAAAAGBrQAAAAAAAIGxAAAAAAAAgbEAAAAAAAABrQAAAAAAA4GpAAAAAAADAa0AAAAAAAEBrQAAAAAAAoGtAAAAAAACga0AAAAAAAOBqQAAAAAAAwGxAAAAAAADgakAAAAAAAIBqQAAAAAAAgGpAAAAAAADAbEAAAAAAAKBqQAAAAAAAwGpAAAAAAAAAa0AAAAAAAEBqQAAAAAAAYGtAAAAAAADgakAAAAAAAOBqQAAAAAAAIGpAAAAAAABAakAAAAAAAMBrQAAAAAAAQGpAAAAAAAAgakAAAAAAAABqQAAAAAAAoGpAAAAAAACAakAAAAAAAEBqQAAAAAAAIGpAAAAAAAAAakAAAAAAAGBqQAAAAAAAIGpAAAAAAACgakAAAAAAAIBsQAAAAAAAgGxAAAAAAAAga0AAAAAAAEBrQAAAAAAAoGtAAAAAAACgakAAAAAAAOBqQAAAAAAAgGxAAAAAAAAga0AAAAAAAEBqQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[68]}},\"selected\":{\"id\":\"1597\"},\"selection_policy\":{\"id\":\"1596\"}},\"id\":\"1529\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1487\",\"type\":\"DataRange1d\"},{\"attributes\":{\"fill_color\":{\"value\":\"#2ca02c\"},\"hatch_color\":{\"value\":\"#2ca02c\"},\"lin
e_color\":{\"value\":\"#2ca02c\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1530\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1590\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"axis_label\":\"bill length (mm)\",\"coordinates\":null,\"formatter\":{\"id\":\"1590\"},\"group\":null,\"major_label_policy\":{\"id\":\"1591\"},\"ticker\":{\"id\":\"1496\"}},\"id\":\"1495\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1493\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1489\",\"type\":\"DataRange1d\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.2},\"fill_color\":{\"value\":\"#ff7f0e\"},\"hatch_alpha\":{\"value\":0.2},\"hatch_color\":{\"value\":\"#ff7f0e\"},\"line_alpha\":{\"value\":0.2},\"line_color\":{\"value\":\"#ff7f0e\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1526\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1491\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1591\",\"type\":\"AllLabels\"},{\"attributes\":{\"coordinates\":null,\"group\":null},\"id\":\"1584\",\"type\":\"Title\"},{\"attributes\":{\"source\":{\"id\":\"1523\"}},\"id\":\"1528\",\"type\":\"CDSView\"}],\"root_ids\":[\"1486\"]},\"title\":\"Bokeh Application\",\"version\":\"2.4.3\"}};\n", + " const render_items = [{\"docid\":\"3c278beb-0c4b-4a08-a148-5229d0a681f7\",\"root_ids\":[\"1486\"],\"roots\":{\"1486\":\"08f1bbbd-03f4-4d4e-bd51-424f145862ef\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " let attempts = 0;\n", + " const timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + 
" }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1486" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# Set up figure\n", + "p = bokeh.plotting.figure(\n", + " frame_width=300,\n", + " frame_height=300,\n", + " x_axis_label=\"bill length (mm)\",\n", + " y_axis_label=\"flipper length (mm)\",\n", + " toolbar_location=\"above\",\n", + ")\n", + "\n", + "# Build legend as we populate glyphs\n", + "legend_items = []\n", + "for color, species in zip(bokeh.palettes.Category10_3, df[\"species\"].unique()):\n", + " # Which species\n", + " species_inds = df[\"species\"] == species\n", + "\n", + " # Slice out bill and flipper lengths for species\n", + " bill_length = df.loc[(df[\"variable\"] == \"bill_length_mm\") & species_inds, \"value\"]\n", + " flipper_length = df.loc[(df[\"variable\"] == \"flipper_length_mm\") & species_inds, \"value\"]\n", + "\n", + " # Populate glyph\n", + " glyph = p.circle(bill_length, flipper_length, color=color)\n", + " legend_items.append((species, [glyph]))\n", + "\n", + "# Build and place legend\n", + "legend = bokeh.models.Legend(items=legend_items, location=\"center\")\n", + "p.add_layout(legend, \"right\")\n", + "p.legend.click_policy = \"hide\"\n", + "\n", + "bokeh.io.show(p)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This works fine, but is more cumbersome and therefore prone to error because we cannot use a groupby operation. The moral of the story is that you should tidy your data, but you should think carefully about in what way your data are tidy." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Computing environment" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python implementation: CPython\n", + "Python version : 3.9.13\n", + "IPython version : 8.4.0\n", + "\n", + "numpy : 1.21.5\n", + "pandas : 1.4.3\n", + "bokeh : 2.4.3\n", + "iqplot : 0.3.2\n", + "jupyterlab: 3.4.4\n", + "\n" + ] + } + ], + "source": [ + "%load_ext watermark\n", + "%watermark -v -p numpy,pandas,bokeh,iqplot,jupyterlab" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/2024/exercise_solutions/exercise_1/exercise_1.1_solution.html b/2024/exercise_solutions/exercise_1/exercise_1.1_solution.html index f379d791..0359cc29 100644 --- a/2024/exercise_solutions/exercise_1/exercise_1.1_solution.html +++ b/2024/exercise_solutions/exercise_1/exercise_1.1_solution.html @@ -134,6 +134,7 @@
  • Exercise 1.6: RNA secondary structure validator
  • +
  • Exercise 2 solutions
  • Schedule

    +
  • Exercise 2 solutions
  • Schedule

    +
  • Exercise 2 solutions
  • Schedule