{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "A newer version of GraphLab Create (v1.8.5) is available! Your current version is v1.8.4.\n", "\n", "You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.\n" ] } ], "source": [ "# 先import package\n", "import graphlab" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2016-03-18 13:49:05,483 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.4 started. Logging: C:\\Users\\ben\\AppData\\Local\\Temp\\graphlab_server_1458280143.log.0\n" ] }, { "data": { "text/html": [ "
Finished parsing file C:\\Users\\ben\\people-example.csv
" ], "text/plain": [ "Finished parsing file C:\\Users\\ben\\people-example.csv" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Parsing completed. Parsed 7 lines in 0.017012 secs.
" ], "text/plain": [ "Parsing completed. Parsed 7 lines in 0.017012 secs." ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "This non-commercial license of GraphLab Create is assigned to simulatedmsn@gmail.com and will expire on March 16, 2017. For commercial licensing options, visit https://dato.com/buy/.\n", "------------------------------------------------------" ] }, { "data": { "text/html": [ "
Finished parsing file C:\\Users\\ben\\people-example.csv
" ], "text/plain": [ "Finished parsing file C:\\Users\\ben\\people-example.csv" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Parsing completed. Parsed 7 lines in 0.018013 secs.
" ], "text/plain": [ "Parsing completed. Parsed 7 lines in 0.018013 secs." ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Inferred types from first line of file as \n", "column_type_hints=[str,str,str,long]\n", "If parsing fails due to incorrect types, you can correct\n", "the inferred type list above and pass it to read_csv in\n", "the column_type_hints argument\n", "------------------------------------------------------\n" ] } ], "source": [ "# 讀取csv檔 \n", "sf = graphlab.SFrame('people-example.csv')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
First NameLast NameCountryage
BobSmithUnited States24
AliceWilliamsCanada23
MalcolmJoneEngland22
FelixBrownUSA23
AlexCooperPoland23
TodCampbellUnited States22
DerekWardSwitzerland25
\n", "[7 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tFirst Name\tstr\n", "\tLast Name\tstr\n", "\tCountry\tstr\n", "\tage\tint\n", "\n", "Rows: 7\n", "\n", "Data:\n", "+------------+-----------+---------------+-----+\n", "| First Name | Last Name | Country | age |\n", "+------------+-----------+---------------+-----+\n", "| Bob | Smith | United States | 24 |\n", "| Alice | Williams | Canada | 23 |\n", "| Malcolm | Jone | England | 22 |\n", "| Felix | Brown | USA | 23 |\n", "| Alex | Cooper | Poland | 23 |\n", "| Tod | Campbell | United States | 22 |\n", "| Derek | Ward | Switzerland | 25 |\n", "+------------+-----------+---------------+-----+\n", "[7 rows x 4 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 顯示檔案的部分內容(因為這筆資料很少,所以全部都會顯示出來)\n", "sf" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Canvas is updated and available in a tab in the default browser.\n" ] } ], "source": [ "# 執行之後會顯示一個新視窗,供使用者探索該讀入的資料\n", "sf.show()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# 設定讓graphlab顯示在ipython notebook\n", "graphlab.canvas.set_target('ipynb')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "application/javascript": [ "$(\"head\").append($(\"\").attr({\n", " rel: \"stylesheet\",\n", " type: \"text/css\",\n", " href: \"//cdnjs.cloudflare.com/ajax/libs/font-awesome/4.1.0/css/font-awesome.min.css\"\n", "}));\n", "$(\"head\").append($(\"\").attr({\n", " rel: \"stylesheet\",\n", " type: \"text/css\",\n", " href: \"//dato.com/files/canvas/1.8.4/css/canvas.css\"\n", "}));\n", "\n", " (function(){\n", "\n", " var e = null;\n", " if (typeof element == 'undefined') {\n", " var scripts = document.getElementsByTagName('script');\n", " var thisScriptTag = scripts[scripts.length-1];\n", " var parentDiv = thisScriptTag.parentNode;\n", " e = document.createElement('div');\n", " parentDiv.appendChild(e);\n", " } else {\n", " e = element[0];\n", " }\n", "\n", " if (typeof requirejs !== 'undefined') {\n", " // disable load timeout; ipython_app.js is large and can take a while to load.\n", " requirejs.config({waitSeconds: 0});\n", " }\n", "\n", " require(['//dato.com/files/canvas/1.8.4/js/ipython_app.js'], function(IPythonApp){\n", " var app = new IPythonApp();\n", " app.attachView('sarray','Categorical', {\"ipython\": true, \"sketch\": {\"std\": 0.989743318610787, \"complete\": true, \"min\": 22.0, \"max\": 25.0, \"quantile\": [22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0], \"median\": 23.0, \"numeric\": true, \"num_unique\": 4, \"num_undefined\": 0, \"var\": 0.9795918367346939, \"progress\": 1.0, \"size\": 7, \"frequent_items\": {\"24\": {\"frequency\": 1, \"value\": 24}, \"25\": {\"frequency\": 1, \"value\": 25}, \"22\": {\"frequency\": 2, \"value\": 22}, \"23\": {\"frequency\": 3, \"value\": 23}}, \"mean\": 23.142857142857146}, \"selected_variable\": {\"name\": [\"\"], \"dtype\": \"int\", \"view_component\": \"Categorical\", \"view_file\": \"sarray\", \"descriptives\": {\"rows\": 7}, \"type\": \"SArray\", \"view_components\": [\"Numeric\", \"Categorical\"]}, \"histogram\": {\"progress\": 1.0, \"histogram\": {\"max\": 25.016, \"bins\": [2, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0, 1], \"min\": 21.992}, \"min\": 22, \"complete\": 1, \"max\": 25}}, e);\n", " });\n", " })();\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sf['age'].show(view='Categorical')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dtype: str\n", "Rows: 7\n", "['United States', 'Canada', 'England', 'USA', 'Poland', 'United States', 'Switzerland']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sf['Country']" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "23.142857142857146" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sf['age'].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create new columns 建立新欄位" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# 把兩個欄位合併在一起\n", "sf['Full Name'] = sf['First Name'] + ' ' + sf['Last Name']" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
First NameLast NameCountryageFull Name
BobSmithUnited States24Bob Smith
AliceWilliamsCanada23Alice Williams
MalcolmJoneEngland22Malcolm Jone
FelixBrownUSA23Felix Brown
AlexCooperPoland23Alex Cooper
TodCampbellUnited States22Tod Campbell
DerekWardSwitzerland25Derek Ward
\n", "[7 rows x 5 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tFirst Name\tstr\n", "\tLast Name\tstr\n", "\tCountry\tstr\n", "\tage\tint\n", "\tFull Name\tstr\n", "\n", "Rows: 7\n", "\n", "Data:\n", "+------------+-----------+---------------+-----+----------------+\n", "| First Name | Last Name | Country | age | Full Name |\n", "+------------+-----------+---------------+-----+----------------+\n", "| Bob | Smith | United States | 24 | Bob Smith |\n", "| Alice | Williams | Canada | 23 | Alice Williams |\n", "| Malcolm | Jone | England | 22 | Malcolm Jone |\n", "| Felix | Brown | USA | 23 | Felix Brown |\n", "| Alex | Cooper | Poland | 23 | Alex Cooper |\n", "| Tod | Campbell | United States | 22 | Tod Campbell |\n", "| Derek | Ward | Switzerland | 25 | Derek Ward |\n", "+------------+-----------+---------------+-----+----------------+\n", "[7 rows x 5 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Use the apply function" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dtype: str\n", "Rows: 7\n", "['United States', 'Canada', 'England', 'USA', 'Poland', 'United States', 'Switzerland']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 顯示出來之後會發現有些是United States,有些是USA,所以我們要清理一下資料\n", "sf['Country']" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "application/javascript": [ "$(\"head\").append($(\"\").attr({\n", " rel: \"stylesheet\",\n", " type: \"text/css\",\n", " href: \"//cdnjs.cloudflare.com/ajax/libs/font-awesome/4.1.0/css/font-awesome.min.css\"\n", "}));\n", "$(\"head\").append($(\"\").attr({\n", " rel: \"stylesheet\",\n", " type: \"text/css\",\n", " href: \"//dato.com/files/canvas/1.8.4/css/canvas.css\"\n", "}));\n", "\n", " (function(){\n", "\n", " var e = null;\n", " if (typeof element == 'undefined') {\n", " var scripts = document.getElementsByTagName('script');\n", " var thisScriptTag = scripts[scripts.length-1];\n", " var parentDiv = thisScriptTag.parentNode;\n", " e = document.createElement('div');\n", " parentDiv.appendChild(e);\n", " } else {\n", " e = element[0];\n", " }\n", "\n", " if (typeof requirejs !== 'undefined') {\n", " // disable load timeout; ipython_app.js is large and can take a while to load.\n", " requirejs.config({waitSeconds: 0});\n", " }\n", "\n", " require(['//dato.com/files/canvas/1.8.4/js/ipython_app.js'], function(IPythonApp){\n", " var app = new IPythonApp();\n", " app.attachView('sarray','Categorical', {\"ipython\": true, \"sketch\": {\"complete\": true, \"numeric\": false, \"num_unique\": 6, \"num_undefined\": 0, \"progress\": 1.0, \"frequent_items\": {\"Canada\": {\"frequency\": 1, \"value\": \"Canada\"}, \"England\": {\"frequency\": 1, \"value\": \"England\"}, \"USA\": {\"frequency\": 1, \"value\": \"USA\"}, \"Poland\": {\"frequency\": 1, \"value\": \"Poland\"}, \"United States\": {\"frequency\": 2, \"value\": \"United States\"}, \"Switzerland\": {\"frequency\": 1, \"value\": \"Switzerland\"}}, \"size\": 7}, \"selected_variable\": {\"name\": [\"\"], \"dtype\": \"str\", \"view_component\": \"Categorical\", \"view_file\": \"sarray\", \"descriptives\": {\"rows\": 7}, \"type\": \"SArray\", \"view_components\": [\"Categorical\"]}, \"histogram\": null}, e);\n", " });\n", " })();\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sf['Country'].show()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# 定義一個function\n", "def transform_country(country):\n", " if country == 'USA':\n", " return 'United States'\n", " else:\n", " return country" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'United States'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 透過剛剛定義的function,成功的把USA轉成了United States\n", "transform_country('USA')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dtype: str\n", "Rows: 7\n", "['United States', 'Canada', 'England', 'United States', 'Poland', 'United States', 'Switzerland']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 把剛剛的function應用到Country這個欄位,做整體的轉換\n", "sf['Country'].apply(transform_country)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "sf['Country'] = sf['Country'].apply(transform_country)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
First NameLast NameCountryageFull Name
BobSmithUnited States24Bob Smith
AliceWilliamsCanada23Alice Williams
MalcolmJoneEngland22Malcolm Jone
FelixBrownUnited States23Felix Brown
AlexCooperPoland23Alex Cooper
TodCampbellUnited States22Tod Campbell
DerekWardSwitzerland25Derek Ward
\n", "[7 rows x 5 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tFirst Name\tstr\n", "\tLast Name\tstr\n", "\tCountry\tstr\n", "\tage\tint\n", "\tFull Name\tstr\n", "\n", "Rows: 7\n", "\n", "Data:\n", "+------------+-----------+---------------+-----+----------------+\n", "| First Name | Last Name | Country | age | Full Name |\n", "+------------+-----------+---------------+-----+----------------+\n", "| Bob | Smith | United States | 24 | Bob Smith |\n", "| Alice | Williams | Canada | 23 | Alice Williams |\n", "| Malcolm | Jone | England | 22 | Malcolm Jone |\n", "| Felix | Brown | United States | 23 | Felix Brown |\n", "| Alex | Cooper | Poland | 23 | Alex Cooper |\n", "| Tod | Campbell | United States | 22 | Tod Campbell |\n", "| Derek | Ward | Switzerland | 25 | Derek Ward |\n", "+------------+-----------+---------------+-----+----------------+\n", "[7 rows x 5 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 第四筆資料成功地變成了United States了\n", "sf" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }