pythonbook/实验 探索Chipotle快餐数据/5.探索虚拟姓名数据.ipynb

810 lines
23 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# -- 创建DataFrame\n",
"# -- 将上述的DataFrame分别命名为data1, data2, data3\n",
"# -- 将data1和data2两个数据框按照行的维度进行合并命名为all_data\n",
"# -- 将data1和data2两个数据框按照列的维度进行合并命名为all_data_col\n",
"# -- 打印data3\n",
"# -- 按照subject_id的值对all_data和data3作合并\n",
"# -- 对data1和data2按照subject_id作连接\n",
"# -- 找到 data1 和 data2 合并之后的所有匹配结果"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"raw_data_1 = {\n",
" 'subject_id': ['1', '2', '3', '4', '5'],\n",
" 'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'], \n",
" 'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}\n",
"\n",
"raw_data_2 = {\n",
" 'subject_id': ['4', '5', '6', '7', '8'],\n",
" 'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'], \n",
" 'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}\n",
"\n",
"raw_data_3 = {\n",
" 'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],\n",
" 'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject_id</th>\n",
" <th>first_name</th>\n",
" <th>last_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Alex</td>\n",
" <td>Anderson</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Amy</td>\n",
" <td>Ackerman</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Allen</td>\n",
" <td>Ali</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Alice</td>\n",
" <td>Aoni</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Ayoung</td>\n",
" <td>Atiches</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" subject_id first_name last_name\n",
"0 1 Alex Anderson\n",
"1 2 Amy Ackerman\n",
"2 3 Allen Ali\n",
"3 4 Alice Aoni\n",
"4 5 Ayoung Atiches"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#将上述的DataFrame分别命名为data1, data2, data3\n",
"data1 = pd.DataFrame(raw_data_1)\n",
"data2 = pd.DataFrame(raw_data_2)\n",
"data3 = pd.DataFrame(raw_data_3)\n",
"data1"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject_id</th>\n",
" <th>first_name</th>\n",
" <th>last_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Alex</td>\n",
" <td>Anderson</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Amy</td>\n",
" <td>Ackerman</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Allen</td>\n",
" <td>Ali</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Alice</td>\n",
" <td>Aoni</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Ayoung</td>\n",
" <td>Atiches</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4</td>\n",
" <td>Billy</td>\n",
" <td>Bonder</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>Brian</td>\n",
" <td>Black</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>Bran</td>\n",
" <td>Balwner</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7</td>\n",
" <td>Bryce</td>\n",
" <td>Brice</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8</td>\n",
" <td>Betty</td>\n",
" <td>Btisan</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" subject_id first_name last_name\n",
"0 1 Alex Anderson\n",
"1 2 Amy Ackerman\n",
"2 3 Allen Ali\n",
"3 4 Alice Aoni\n",
"4 5 Ayoung Atiches\n",
"0 4 Billy Bonder\n",
"1 5 Brian Black\n",
"2 6 Bran Balwner\n",
"3 7 Bryce Brice\n",
"4 8 Betty Btisan"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#将data1和data2两个数据框按照行的维度进行合并命名为all_data\n",
"all_data = pd.concat([data1,data2],axis=0)\n",
"all_data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject_id</th>\n",
" <th>first_name</th>\n",
" <th>last_name</th>\n",
" <th>subject_id</th>\n",
" <th>first_name</th>\n",
" <th>last_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Alex</td>\n",
" <td>Anderson</td>\n",
" <td>4</td>\n",
" <td>Billy</td>\n",
" <td>Bonder</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Amy</td>\n",
" <td>Ackerman</td>\n",
" <td>5</td>\n",
" <td>Brian</td>\n",
" <td>Black</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Allen</td>\n",
" <td>Ali</td>\n",
" <td>6</td>\n",
" <td>Bran</td>\n",
" <td>Balwner</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Alice</td>\n",
" <td>Aoni</td>\n",
" <td>7</td>\n",
" <td>Bryce</td>\n",
" <td>Brice</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Ayoung</td>\n",
" <td>Atiches</td>\n",
" <td>8</td>\n",
" <td>Betty</td>\n",
" <td>Btisan</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" subject_id first_name last_name subject_id first_name last_name\n",
"0 1 Alex Anderson 4 Billy Bonder\n",
"1 2 Amy Ackerman 5 Brian Black\n",
"2 3 Allen Ali 6 Bran Balwner\n",
"3 4 Alice Aoni 7 Bryce Brice\n",
"4 5 Ayoung Atiches 8 Betty Btisan"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#将data1和data2两个数据框按照列的维度进行合并命名为all_data_col\n",
"all_data_col = pd.concat([data1,data2],axis=1)\n",
"all_data_col"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject_id</th>\n",
" <th>test_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>51</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>7</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>10</td>\n",
" <td>61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>11</td>\n",
" <td>16</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" subject_id test_id\n",
"0 1 51\n",
"1 2 15\n",
"2 3 15\n",
"3 4 61\n",
"4 5 16\n",
"5 7 14\n",
"6 8 15\n",
"7 9 1\n",
"8 10 61\n",
"9 11 16"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data3"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject_id</th>\n",
" <th>first_name</th>\n",
" <th>last_name</th>\n",
" <th>test_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Alex</td>\n",
" <td>Anderson</td>\n",
" <td>51</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Amy</td>\n",
" <td>Ackerman</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Allen</td>\n",
" <td>Ali</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Alice</td>\n",
" <td>Aoni</td>\n",
" <td>61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>Billy</td>\n",
" <td>Bonder</td>\n",
" <td>61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>5</td>\n",
" <td>Ayoung</td>\n",
" <td>Atiches</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>5</td>\n",
" <td>Brian</td>\n",
" <td>Black</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>7</td>\n",
" <td>Bryce</td>\n",
" <td>Brice</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>8</td>\n",
" <td>Betty</td>\n",
" <td>Btisan</td>\n",
" <td>15</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" subject_id first_name last_name test_id\n",
"0 1 Alex Anderson 51\n",
"1 2 Amy Ackerman 15\n",
"2 3 Allen Ali 15\n",
"3 4 Alice Aoni 61\n",
"4 4 Billy Bonder 61\n",
"5 5 Ayoung Atiches 16\n",
"6 5 Brian Black 16\n",
"7 7 Bryce Brice 14\n",
"8 8 Betty Btisan 15"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#按照subject_id的值对all_data和data3作合并\n",
"pd.merge(all_data,data3,on='subject_id')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject_id</th>\n",
" <th>first_name_x</th>\n",
" <th>last_name_x</th>\n",
" <th>first_name_y</th>\n",
" <th>last_name_y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4</td>\n",
" <td>Alice</td>\n",
" <td>Aoni</td>\n",
" <td>Billy</td>\n",
" <td>Bonder</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>Ayoung</td>\n",
" <td>Atiches</td>\n",
" <td>Brian</td>\n",
" <td>Black</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" subject_id first_name_x last_name_x first_name_y last_name_y\n",
"0 4 Alice Aoni Billy Bonder\n",
"1 5 Ayoung Atiches Brian Black"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#对data1和data2按照subject_id作内连接\n",
"pd.merge(data1,data2,on='subject_id',how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject_id</th>\n",
" <th>first_name_data1</th>\n",
" <th>last_name_data1</th>\n",
" <th>first_name_data2</th>\n",
" <th>last_name_data2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Alex</td>\n",
" <td>Anderson</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Amy</td>\n",
" <td>Ackerman</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Allen</td>\n",
" <td>Ali</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Alice</td>\n",
" <td>Aoni</td>\n",
" <td>Billy</td>\n",
" <td>Bonder</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Ayoung</td>\n",
" <td>Atiches</td>\n",
" <td>Brian</td>\n",
" <td>Black</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Bran</td>\n",
" <td>Balwner</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Bryce</td>\n",
" <td>Brice</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Betty</td>\n",
" <td>Btisan</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" subject_id first_name_data1 last_name_data1 first_name_data2 last_name_data2\n",
"0 1 Alex Anderson NaN NaN\n",
"1 2 Amy Ackerman NaN NaN\n",
"2 3 Allen Ali NaN NaN\n",
"3 4 Alice Aoni Billy Bonder\n",
"4 5 Ayoung Atiches Brian Black\n",
"5 6 NaN NaN Bran Balwner\n",
"6 7 NaN NaN Bryce Brice\n",
"7 8 NaN NaN Betty Btisan"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#找到 data1 和 data2 合并之后的所有匹配结果\n",
"pd.merge(data1,data2,on='subject_id',how='outer',suffixes=('_data1', '_data2'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}