{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"A newer version of GraphLab Create (v1.8.5) is available! Your current version is v1.8.4.\n",
"\n",
"You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.\n"
]
}
],
"source": [
"# Import the GraphLab Create package first\n",
"import graphlab"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2016-03-18 13:49:05,483 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.4 started. Logging: C:\\Users\\ben\\AppData\\Local\\Temp\\graphlab_server_1458280143.log.0\n"
]
},
{
"data": {
"text/html": [
"<pre>Finished parsing file C:\\Users\\ben\\people-example.csv</pre>"
],
"text/plain": [
"Finished parsing file C:\\Users\\ben\\people-example.csv"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre>Parsing completed. Parsed 7 lines in 0.017012 secs.</pre>"
],
"text/plain": [
"Parsing completed. Parsed 7 lines in 0.017012 secs."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"This non-commercial license of GraphLab Create is assigned to simulatedmsn@gmail.com and will expire on March 16, 2017. For commercial licensing options, visit https://dato.com/buy/.\n",
"------------------------------------------------------"
]
},
{
"data": {
"text/html": [
"<pre>Finished parsing file C:\\Users\\ben\\people-example.csv</pre>"
],
"text/plain": [
"Finished parsing file C:\\Users\\ben\\people-example.csv"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre>Parsing completed. Parsed 7 lines in 0.018013 secs.</pre>"
],
"text/plain": [
"Parsing completed. Parsed 7 lines in 0.018013 secs."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Inferred types from first line of file as \n",
"column_type_hints=[str,str,str,long]\n",
"If parsing fails due to incorrect types, you can correct\n",
"the inferred type list above and pass it to read_csv in\n",
"the column_type_hints argument\n",
"------------------------------------------------------\n"
]
}
],
"source": [
"# Load the CSV file into an SFrame\n",
"sf = graphlab.SFrame('people-example.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div><table>\n",
"    <tr>\n",
"        <th>First Name</th>\n",
"        <th>Last Name</th>\n",
"        <th>Country</th>\n",
"        <th>age</th>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Bob</td>\n",
"        <td>Smith</td>\n",
"        <td>United States</td>\n",
"        <td>24</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Alice</td>\n",
"        <td>Williams</td>\n",
"        <td>Canada</td>\n",
"        <td>23</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Malcolm</td>\n",
"        <td>Jone</td>\n",
"        <td>England</td>\n",
"        <td>22</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Felix</td>\n",
"        <td>Brown</td>\n",
"        <td>USA</td>\n",
"        <td>23</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Alex</td>\n",
"        <td>Cooper</td>\n",
"        <td>Poland</td>\n",
"        <td>23</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Tod</td>\n",
"        <td>Campbell</td>\n",
"        <td>United States</td>\n",
"        <td>22</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Derek</td>\n",
"        <td>Ward</td>\n",
"        <td>Switzerland</td>\n",
"        <td>25</td>\n",
"    </tr>\n",
"</table>\n",
"[7 rows x 4 columns]<br/>\n",
"</div>"
],
"text/plain": [
"Columns:\n",
"\tFirst Name\tstr\n",
"\tLast Name\tstr\n",
"\tCountry\tstr\n",
"\tage\tint\n",
"\n",
"Rows: 7\n",
"\n",
"Data:\n",
"+------------+-----------+---------------+-----+\n",
"| First Name | Last Name | Country | age |\n",
"+------------+-----------+---------------+-----+\n",
"| Bob | Smith | United States | 24 |\n",
"| Alice | Williams | Canada | 23 |\n",
"| Malcolm | Jone | England | 22 |\n",
"| Felix | Brown | USA | 23 |\n",
"| Alex | Cooper | Poland | 23 |\n",
"| Tod | Campbell | United States | 22 |\n",
"| Derek | Ward | Switzerland | 25 |\n",
"+------------+-----------+---------------+-----+\n",
"[7 rows x 4 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show a preview of the loaded data (the dataset is so small that every row is displayed)\n",
"sf"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Canvas is updated and available in a tab in the default browser.\n"
]
}
],
"source": [
"# Running this opens a new window where the user can explore the loaded data\n",
"sf.show()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Configure graphlab to render its visualizations inside the IPython notebook\n",
"graphlab.canvas.set_target('ipynb')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"application/javascript": [
"$(\"head\").append($(\"<link/>\").attr({\n",
" rel: \"stylesheet\",\n",
" type: \"text/css\",\n",
" href: \"//cdnjs.cloudflare.com/ajax/libs/font-awesome/4.1.0/css/font-awesome.min.css\"\n",
"}));\n",
"$(\"head\").append($(\"<link/>\").attr({\n",
" rel: \"stylesheet\",\n",
" type: \"text/css\",\n",
" href: \"//dato.com/files/canvas/1.8.4/css/canvas.css\"\n",
"}));\n",
"\n",
" (function(){\n",
"\n",
" var e = null;\n",
" if (typeof element == 'undefined') {\n",
" var scripts = document.getElementsByTagName('script');\n",
" var thisScriptTag = scripts[scripts.length-1];\n",
" var parentDiv = thisScriptTag.parentNode;\n",
" e = document.createElement('div');\n",
" parentDiv.appendChild(e);\n",
" } else {\n",
" e = element[0];\n",
" }\n",
"\n",
" if (typeof requirejs !== 'undefined') {\n",
" // disable load timeout; ipython_app.js is large and can take a while to load.\n",
" requirejs.config({waitSeconds: 0});\n",
" }\n",
"\n",
" require(['//dato.com/files/canvas/1.8.4/js/ipython_app.js'], function(IPythonApp){\n",
" var app = new IPythonApp();\n",
" app.attachView('sarray','Categorical', {\"ipython\": true, \"sketch\": {\"std\": 0.989743318610787, \"complete\": true, \"min\": 22.0, \"max\": 25.0, \"quantile\": [22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0], \"median\": 23.0, \"numeric\": true, \"num_unique\": 4, \"num_undefined\": 0, \"var\": 0.9795918367346939, \"progress\": 1.0, \"size\": 7, \"frequent_items\": {\"24\": {\"frequency\": 1, \"value\": 24}, \"25\": {\"frequency\": 1, \"value\": 25}, \"22\": {\"frequency\": 2, \"value\": 22}, \"23\": {\"frequency\": 3, \"value\": 23}}, \"mean\": 23.142857142857146}, \"selected_variable\": {\"name\": [\"\"], \"dtype\": \"int\", \"view_component\": \"Categorical\", \"view_file\": \"sarray\", \"descriptives\": {\"rows\": 7}, \"type\": \"SArray\", \"view_components\": [\"Numeric\", \"Categorical\"]}, \"histogram\": {\"progress\": 1.0, \"histogram\": {\"max\": 25.016, \"bins\": [2, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0, 1], \"min\": 21.992}, \"min\": 22, \"complete\": 1, \"max\": 25}}, e);\n",
" });\n",
" })();\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sf['age'].show(view='Categorical')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dtype: str\n",
"Rows: 7\n",
"['United States', 'Canada', 'England', 'USA', 'Poland', 'United States', 'Switzerland']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sf['Country']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"23.142857142857146"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sf['age'].mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create new columns 建立新欄位"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Combine the two name columns into a single column\n",
"sf['Full Name'] = sf['First Name'] + ' ' + sf['Last Name']"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div><table>\n",
"    <tr>\n",
"        <th>First Name</th>\n",
"        <th>Last Name</th>\n",
"        <th>Country</th>\n",
"        <th>age</th>\n",
"        <th>Full Name</th>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Bob</td>\n",
"        <td>Smith</td>\n",
"        <td>United States</td>\n",
"        <td>24</td>\n",
"        <td>Bob Smith</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Alice</td>\n",
"        <td>Williams</td>\n",
"        <td>Canada</td>\n",
"        <td>23</td>\n",
"        <td>Alice Williams</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Malcolm</td>\n",
"        <td>Jone</td>\n",
"        <td>England</td>\n",
"        <td>22</td>\n",
"        <td>Malcolm Jone</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Felix</td>\n",
"        <td>Brown</td>\n",
"        <td>USA</td>\n",
"        <td>23</td>\n",
"        <td>Felix Brown</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Alex</td>\n",
"        <td>Cooper</td>\n",
"        <td>Poland</td>\n",
"        <td>23</td>\n",
"        <td>Alex Cooper</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Tod</td>\n",
"        <td>Campbell</td>\n",
"        <td>United States</td>\n",
"        <td>22</td>\n",
"        <td>Tod Campbell</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Derek</td>\n",
"        <td>Ward</td>\n",
"        <td>Switzerland</td>\n",
"        <td>25</td>\n",
"        <td>Derek Ward</td>\n",
"    </tr>\n",
"</table>\n",
"[7 rows x 5 columns]<br/>\n",
"</div>"
],
"text/plain": [
"Columns:\n",
"\tFirst Name\tstr\n",
"\tLast Name\tstr\n",
"\tCountry\tstr\n",
"\tage\tint\n",
"\tFull Name\tstr\n",
"\n",
"Rows: 7\n",
"\n",
"Data:\n",
"+------------+-----------+---------------+-----+----------------+\n",
"| First Name | Last Name | Country | age | Full Name |\n",
"+------------+-----------+---------------+-----+----------------+\n",
"| Bob | Smith | United States | 24 | Bob Smith |\n",
"| Alice | Williams | Canada | 23 | Alice Williams |\n",
"| Malcolm | Jone | England | 22 | Malcolm Jone |\n",
"| Felix | Brown | USA | 23 | Felix Brown |\n",
"| Alex | Cooper | Poland | 23 | Alex Cooper |\n",
"| Tod | Campbell | United States | 22 | Tod Campbell |\n",
"| Derek | Ward | Switzerland | 25 | Derek Ward |\n",
"+------------+-----------+---------------+-----+----------------+\n",
"[7 rows x 5 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Use the apply function"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dtype: str\n",
"Rows: 7\n",
"['United States', 'Canada', 'England', 'USA', 'Poland', 'United States', 'Switzerland']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The output shows that some rows say United States while others say USA, so the data needs cleaning\n",
"sf['Country']"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"application/javascript": [
"$(\"head\").append($(\"<link/>\").attr({\n",
" rel: \"stylesheet\",\n",
" type: \"text/css\",\n",
" href: \"//cdnjs.cloudflare.com/ajax/libs/font-awesome/4.1.0/css/font-awesome.min.css\"\n",
"}));\n",
"$(\"head\").append($(\"<link/>\").attr({\n",
" rel: \"stylesheet\",\n",
" type: \"text/css\",\n",
" href: \"//dato.com/files/canvas/1.8.4/css/canvas.css\"\n",
"}));\n",
"\n",
" (function(){\n",
"\n",
" var e = null;\n",
" if (typeof element == 'undefined') {\n",
" var scripts = document.getElementsByTagName('script');\n",
" var thisScriptTag = scripts[scripts.length-1];\n",
" var parentDiv = thisScriptTag.parentNode;\n",
" e = document.createElement('div');\n",
" parentDiv.appendChild(e);\n",
" } else {\n",
" e = element[0];\n",
" }\n",
"\n",
" if (typeof requirejs !== 'undefined') {\n",
" // disable load timeout; ipython_app.js is large and can take a while to load.\n",
" requirejs.config({waitSeconds: 0});\n",
" }\n",
"\n",
" require(['//dato.com/files/canvas/1.8.4/js/ipython_app.js'], function(IPythonApp){\n",
" var app = new IPythonApp();\n",
" app.attachView('sarray','Categorical', {\"ipython\": true, \"sketch\": {\"complete\": true, \"numeric\": false, \"num_unique\": 6, \"num_undefined\": 0, \"progress\": 1.0, \"frequent_items\": {\"Canada\": {\"frequency\": 1, \"value\": \"Canada\"}, \"England\": {\"frequency\": 1, \"value\": \"England\"}, \"USA\": {\"frequency\": 1, \"value\": \"USA\"}, \"Poland\": {\"frequency\": 1, \"value\": \"Poland\"}, \"United States\": {\"frequency\": 2, \"value\": \"United States\"}, \"Switzerland\": {\"frequency\": 1, \"value\": \"Switzerland\"}}, \"size\": 7}, \"selected_variable\": {\"name\": [\"\"], \"dtype\": \"str\", \"view_component\": \"Categorical\", \"view_file\": \"sarray\", \"descriptives\": {\"rows\": 7}, \"type\": \"SArray\", \"view_components\": [\"Categorical\"]}, \"histogram\": null}, e);\n",
" });\n",
" })();\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sf['Country'].show()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# 定義一個function\n",
"def transform_country(country):\n",
" if country == 'USA':\n",
" return 'United States'\n",
" else:\n",
" return country"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'United States'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The function defined above successfully converts USA to United States\n",
"transform_country('USA')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dtype: str\n",
"Rows: 7\n",
"['United States', 'Canada', 'England', 'United States', 'Poland', 'United States', 'Switzerland']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Apply that function to the Country column to convert every value\n",
"sf['Country'].apply(transform_country)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sf['Country'] = sf['Country'].apply(transform_country)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div><table>\n",
"    <tr>\n",
"        <th>First Name</th>\n",
"        <th>Last Name</th>\n",
"        <th>Country</th>\n",
"        <th>age</th>\n",
"        <th>Full Name</th>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Bob</td>\n",
"        <td>Smith</td>\n",
"        <td>United States</td>\n",
"        <td>24</td>\n",
"        <td>Bob Smith</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Alice</td>\n",
"        <td>Williams</td>\n",
"        <td>Canada</td>\n",
"        <td>23</td>\n",
"        <td>Alice Williams</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Malcolm</td>\n",
"        <td>Jone</td>\n",
"        <td>England</td>\n",
"        <td>22</td>\n",
"        <td>Malcolm Jone</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Felix</td>\n",
"        <td>Brown</td>\n",
"        <td>United States</td>\n",
"        <td>23</td>\n",
"        <td>Felix Brown</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Alex</td>\n",
"        <td>Cooper</td>\n",
"        <td>Poland</td>\n",
"        <td>23</td>\n",
"        <td>Alex Cooper</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Tod</td>\n",
"        <td>Campbell</td>\n",
"        <td>United States</td>\n",
"        <td>22</td>\n",
"        <td>Tod Campbell</td>\n",
"    </tr>\n",
"    <tr>\n",
"        <td>Derek</td>\n",
"        <td>Ward</td>\n",
"        <td>Switzerland</td>\n",
"        <td>25</td>\n",
"        <td>Derek Ward</td>\n",
"    </tr>\n",
"</table>\n",
"[7 rows x 5 columns]<br/>\n",
"</div>"
],
"text/plain": [
"Columns:\n",
"\tFirst Name\tstr\n",
"\tLast Name\tstr\n",
"\tCountry\tstr\n",
"\tage\tint\n",
"\tFull Name\tstr\n",
"\n",
"Rows: 7\n",
"\n",
"Data:\n",
"+------------+-----------+---------------+-----+----------------+\n",
"| First Name | Last Name | Country | age | Full Name |\n",
"+------------+-----------+---------------+-----+----------------+\n",
"| Bob | Smith | United States | 24 | Bob Smith |\n",
"| Alice | Williams | Canada | 23 | Alice Williams |\n",
"| Malcolm | Jone | England | 22 | Malcolm Jone |\n",
"| Felix | Brown | United States | 23 | Felix Brown |\n",
"| Alex | Cooper | Poland | 23 | Alex Cooper |\n",
"| Tod | Campbell | United States | 22 | Tod Campbell |\n",
"| Derek | Ward | Switzerland | 25 | Derek Ward |\n",
"+------------+-----------+---------------+-----+----------------+\n",
"[7 rows x 5 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The fourth row now reads United States\n",
"sf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}