You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
categorical (finite set) vs ordinal (ordered) vs numerical (continuous/discrete)
should consider: Blaze for high-level, multi-format API for data I/O
Imports
# Imports: I/O libraries used throughout this chapter.
from __future__ import print_function

import numpy as np
np.random.seed(0)  # deterministic examples

import pandas as pd

import csv
import json
import h5py
import tables
import pickle
# python3: the C-accelerated pickle lives in _pickle
import _pickle as cPickle
# conda install msgpack-python
import msgpack
/home/bjpcjp/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
from ._conv import register_converters as _register_converters
CSV
first: create some example CSV data (hockey player stats) & save it to disk
%%writefile ch18-playerstats-2013-2014.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
Overwriting ch18-playerstats-2013-2014.csv
%%writefile ch18-playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
6,Phil Kessel,TOR,R,82,37,43,80,-5,27,8,20,0,0,6,0,305,12.1,20:39,24.5,14.3
7,Taylor Hall,EDM,L,75,27,53,80,-15,44,7,17,0,1,1,1,250,10.8,20:00,25.4,45.7
8,Alex Ovechkin,WSH,L,78,51,28,79,-35,48,24,39,0,1,10,3,386,13.2,20:32,21.8,66.7
9,Joe Pavelski,SJS,C,82,41,38,79,+23,32,16,31,1,2,3,0,225,18.2,19:51,27.1,56.0
10,Jamie Benn,DAL,L,81,34,45,79,+21,64,5,19,1,3,3,1,279,12.2,19:09,25.0,52.8
11,Nicklas Backstrom,WSH,C,82,18,61,79,-20,54,6,44,1,1,1,0,196,9.2,19:48,23.3,50.4
12,Patrick Sharp,CHI,L,82,34,44,78,+13,40,10,25,0,0,3,1,313,10.9,18:53,22.7,54.6
13,Joe Thornton,SJS,C,82,11,65,76,+20,32,2,19,0,1,3,1,122,9.0,18:55,26.3,56.1
14,Erik Karlsson,OTT,D,82,20,54,74,-15,36,5,31,0,0,1,0,257,7.8,27:04,28.6,0.0
15,Evgeni Malkin,PIT,C,60,23,49,72,+10,62,7,30,0,0,3,0,191,12.0,20:03,21.4,48.8
16,Patrick Marleau,SJS,L,82,33,37,70,+0,18,11,23,2,2,4,0,285,11.6,20:31,27.3,52.9
17,Anze Kopitar,LAK,C,82,29,41,70,+34,24,10,23,0,0,9,2,200,14.5,20:53,25.4,53.3
18,Matt Duchene,COL,C,71,23,47,70,+8,19,5,17,0,0,6,1,217,10.6,18:29,22.0,50.3
19,Martin St. Louis,"TBL, NYR",R,81,30,39,69,+13,10,9,21,1,2,5,1,204,14.7,20:56,25.7,40.7
20,Patrick Kane,CHI,R,69,29,40,69,+7,22,10,25,0,0,6,0,227,12.8,19:36,22.9,50.0
21,Blake Wheeler,WPG,R,82,28,41,69,+4,63,8,19,0,0,4,2,225,12.4,18:41,24.0,37.5
22,Kyle Okposo,NYI,R,71,27,42,69,-9,51,5,15,0,0,4,1,195,13.8,20:26,22.2,47.5
23,David Krejci,BOS,C,80,19,50,69,+39,28,3,19,0,0,6,1,169,11.2,19:07,21.3,51.2
24,Chris Kunitz,PIT,L,78,35,33,68,+25,66,13,22,0,0,8,0,218,16.1,19:09,22.2,75.0
25,Jonathan Toews,CHI,C,76,28,40,68,+26,34,5,15,3,5,5,0,193,14.5,20:28,25.9,57.2
26,Thomas Vanek,"BUF, NYI, MTL",L,78,27,41,68,+7,46,8,18,0,0,4,0,248,10.9,19:21,21.6,43.5
27,Jaromir Jagr,NJD,R,82,24,43,67,+16,46,5,17,0,0,6,1,231,10.4,19:09,22.8,0.0
28,John Tavares,NYI,C,59,24,42,66,-6,40,8,25,0,0,4,0,188,12.8,21:14,22.3,49.1
29,Jason Spezza,OTT,C,75,23,43,66,-26,46,9,22,0,0,5,0,223,10.3,18:12,23.8,54.0
30,Jordan Eberle,EDM,R,80,28,37,65,-11,18,7,20,1,1,4,1,200,14.0,19:32,25.4,38.1
Overwriting ch18-playerstats-2013-2014-top30.csv
# let's see if file contents are as expected
!head -n 5 ch18-playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
Parsed row values will be read as strings, even if values represent numbers.
Numpy loadtxt and savetxt are good for handling numerical arrays on disk.
hierarchical format - organizes data within files using "groups" and "datasets"
groups & datasets can contain "attributes" (metadata)
Python libraries: h5py & PyTables
importh5py
file modes: "w" (create new file; truncate if exists), "r" (read-only; file must exist), "w-" (create new file; error if exists), "r+" (read-write; file must exist), "a" (read-write; create if needed)
# create new read-write filef=h5py.File("ch18-data.h5", "w")
f.mode
'r+'
f.flush()
f.close()
Groups
File object creates both file handle and a "root group" object.
# Create a new PyTables HDF5 file.
f = tables.open_file(
    "ch18-playerstats-2013-2014.h5", mode="w")
# Create an HDF5 group under the root ("/") to hold the season's data.
grp = f.create_group(
    "/",
    "season_2013_2014",
    title="NHL player statistics for the 2013/2014 season")
grp
/season_2013_2014 (Group) 'NHL player statistics for the 2013/2014 season'
children := []
# Unlike h5py, PyTables file objects do not represent root groups in the HDF5 file.# Use the root attribute to access it instead.f.root
/ (RootGroup) ''
children := ['season_2013_2014' (Group)]
# PyTables makes it easy to create tables with mixed column types.
class PlayerStat(tables.IsDescription):
    """Row description for the top-30 point-leaders table.

    Each attribute declares one column: its HDF5 type, width (for
    strings) and default value (dflt).
    """
    player = tables.StringCol(20, dflt="")        # player name, max 20 bytes
    position = tables.StringCol(1, dflt="C")      # single-letter position code
    games_played = tables.UInt8Col(dflt=0)
    points = tables.UInt16Col(dflt=0)
    goals = tables.UInt16Col(dflt=0)
    assists = tables.UInt16Col(dflt=0)
    shooting_percentage = tables.Float64Col(dflt=0.0)
    shifts_per_game_played = tables.Float64Col(dflt=0.0)
# Create the table inside the season group, described by PlayerStat.
top30_table = f.create_table(
    grp, 'top30', PlayerStat, "Top 30 point leaders")
playerstat=top30_table.rowtype(playerstat)
tables.tableextension.Row
# To insert data into the table, use the row attribute of the table object.
# After populating the row object, call append() to queue the row for insertion.
for index, row_series in df.iterrows():
    playerstat["player"] = row_series["Player"]
    playerstat["position"] = row_series["Pos"]
    playerstat["games_played"] = row_series["GP"]
    playerstat["points"] = row_series["P"]
    playerstat["goals"] = row_series["G"]
    playerstat["assists"] = row_series["A"]
    playerstat["shooting_percentage"] = row_series["S%"]
    playerstat["shifts_per_game_played"] = row_series["Shift/GP"]
    playerstat.append()
# flush forces a file writetop30_table.flush()
# access table data using cols attributetop30_table.cols.player[:5]
# Use table.iterrows() to create an iterator for row-wise data access.
def print_playerstat(row):
    """Print one player's name, points, goals and assists from a table row.

    The player column is stored as bytes in HDF5, so it is decoded
    to UTF-8 text before printing.
    """
    print("%20s\t%s\t%s\t%s" %
          (row["player"].decode('UTF-8'), row["points"], row["goals"], row["assists"]))
df1 <HDF5 group "/df1" (4 members)>
df1/axis0 <HDF5 dataset "axis0": shape (5,), type "<i8">
df1/axis1 <HDF5 dataset "axis1": shape (5,), type "<i8">
df1/block0_items <HDF5 dataset "block0_items": shape (5,), type "<i8">
df1/block0_values <HDF5 dataset "block0_values": shape (5, 5), type "<f8">
df2 <HDF5 group "/df2" (8 members)>
df2/axis0 <HDF5 dataset "axis0": shape (21,), type "|S8">
df2/axis1 <HDF5 dataset "axis1": shape (30,), type "<i8">
df2/block0_items <HDF5 dataset "block0_items": shape (3,), type "|S8">
df2/block0_values <HDF5 dataset "block0_values": shape (30, 3), type "<f8">
df2/block1_items <HDF5 dataset "block1_items": shape (14,), type "|S4">
df2/block1_values <HDF5 dataset "block1_values": shape (30, 14), type "<i8">
df2/block2_items <HDF5 dataset "block2_items": shape (4,), type "|S6">
df2/block2_values <HDF5 dataset "block2_values": shape (1,), type "|O">
# HDF5Store objects store dataframes in distinct groups.# Each dataframe is split into heterogeneous "blocks" with columns grouped by data type# Column names & values are stored in separate HDF5 datasets.f["/df2/block0_items"].value
# Another complex data structure, to be saved to a JSON file.
data = {"one": [1],
        "two": {"one": 1, "two": 2},
        "three": [(1,), (1, 2), (1, 2, 3)],
        "four": "a text string"}
# Now we can iterate & filter items with Python list syntax.
# Below: select connected nodes in the graph, on the C line, with travel time == 1 minute.
[(s, e, tt) for s, e, tt in data["C"]["travel_times"] if tt == 1]