import os
import pandas as pd
import mapreduce as mr
import csv

!ls ./trips

ltrips_short_exp123_wkdy_final_noXYcoord.txt
ltrips_short_exp13_sat_final_noXYcoord.txt
ltrips_short_exp13_sun_final_noXYcoord.txt

Trip data

relevant page

Since this data is based on survey, MTA applied and validated weight. The description can be found in the report (pg. 46)
But not fully understood! I just tried this.
Report

week = pd.read_csv('./trips/ltrips_short_exp123_wkdy_final_noXYcoord.txt', dtype=str)

print('The number of total respondents: {}'.format(len(week)))

The number of total respondents: 49167

[(i,j) for i,j in zip(week.columns, range(len(week.columns)))]

[('sampn', 0),
 ('perno', 1),
 ('tripno', 2),
 ('TRIP_ID', 3),
 ('otype', 4),
 ('dtype', 5),
 ('tpur', 6),
 ('dtime', 7),
 ('atime', 8),
 ('trip_sdate', 9),
 ('trip_edate', 10),
 ('O_COUNTY_STR', 11),
 ('D_COUNTY_STR', 12),
 ('Ptype', 13),
 ('daywk', 14),
 ('trans', 15),
 ('travy', 16),
 ('ntrav', 17),
 ('o_ntrav', 18),
 ('trtot', 19),
 ('ussob', 20),
 ('new_fare', 21),
 ('metro', 22),
 ('cash', 23),
 ('tmet', 24),
 ('o_tmet', 25),
 ('tumet', 26),
 ('o_tumet', 27),
 ('ppmet', 28),
 ('fare', 29),
 ('esub1', 30),
 ('esub2', 31),
 ('esub3', 32),
 ('esub4', 33),
 ('esub5', 34),
 ('esub6', 35),
 ('esub7', 36),
 ('esub8', 37),
 ('trip_count', 38),
 ('rectype', 39),
 ('s_dat', 40),
 ('s_dur', 41),
 ('ctfip', 42),
 ('incen', 43),
 ('stype', 44),
 ('lang', 45),
 ('hhnum', 46),
 ('hhveh', 47),
 ('ltele', 48),
 ('ctele', 49),
 ('cctel', 50),
 ('hinet', 51),
 ('hmrac', 52),
 ('o_hmrac', 53),
 ('ccmin', 54),
 ('ccinc', 55),
 ('cclin', 56),
 ('hcity', 57),
 ('hstat', 58),
 ('hzip', 59),
 ('hcnty', 60),
 ('puma', 61),
 ('MODE_G2', 62),
 ('MODE_G3', 63),
 ('MODE_G5', 64),
 ('MODE_G6', 65),
 ('MODE_G8', 66),
 ('MODE_G9', 67),
 ('MODE_G10', 68),
 ('PER_DAYTYPE', 69),
 ('TT_DAYTYPE', 70),
 ('MODE1', 71),
 ('MODE2', 72),
 ('MODE3', 73),
 ('MODE4', 74),
 ('MODE5', 75),
 ('MODE6', 76),
 ('MODE7', 77),
 ('MODE8', 78),
 ('MODE9', 79),
 ('MODE10', 80),
 ('MODE11', 81),
 ('MODE12', 82),
 ('MODE13', 83),
 ('MODE14', 84),
 ('MODE15', 85),
 ('MODE16', 86),
 ('O_Boro', 87),
 ('D_Boro', 88),
 ('HH_adults', 89),
 ('HH_workers', 90),
 ('R_BORO', 91),
 ('HH_SIZE', 92),
 ('HH_INC4', 93),
 ('HH_INC7', 94),
 ('HH_ETHNIC', 95),
 ('O_PURP', 96),
 ('O_NYTTAZ', 97),
 ('O_RTFMTAZ', 98),
 ('O_TRACT', 99),
 ('O_PUMA', 100),
 ('O_COUNTY', 101),
 ('D_PURP', 102),
 ('D_NYTTAZ', 103),
 ('D_RTFMTAZ', 104),
 ('D_TRACT', 105),
 ('D_PUMA', 106),
 ('D_COUNTY', 107),
 ('O_PURP_M', 108),
 ('D_PURP_M', 109),
 ('NSUB', 110),
 ('NNYBUS', 111),
 ('NCRAIL', 112),
 ('NOTHTRAN', 113),
 ('StopAreaNo', 114),
 ('HR_DEP', 115),
 ('HR_ARR', 116),
 ('TRIP_MIN', 117),
 ('EXP1_FINAL_WKD', 118),
 ('EXP21_FINAL_WKD', 119),
 ('EXP22_FINAL_WKD', 120),
 ('EXP31_FINAL_WKD', 121),
 ('EXP32_FINAL_WKD', 122),
 ('hmsex', 123),
 ('hmage', 124),
 ('SIRTOA_FLAG', 125),
 ('O_PLANDIST', 126),
 ('D_PLANDIST', 127)]

sum(week['EXP32_FINAL_WKD'].astype(float).dropna())

14308914.969999703

week[['O_TRACT','D_TRACT']][week['O_TRACT'] == week['D_TRACT']]

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	O_TRACT	D_TRACT
13	36005017300.00	36005017300.00
34	36005017300.00	36005017300.00
38	36005005902.00	36005005902.00
47	36005005901.00	36005005901.00
48	36005005901.00	36005005901.00
49	36005005901.00	36005005901.00
54	36005005901.00	36005005901.00
62	36005006500.00	36005006500.00
63	36005006500.00	36005006500.00
64	36005006500.00	36005006500.00
65	36005006500.00	36005006500.00
67	36061016400.00	36061016400.00
68	36061016400.00	36061016400.00
73	36061016400.00	36061016400.00
74	36061016400.00	36061016400.00
86	36005004700.00	36005004700.00
99	36005004700.00	36005004700.00
100	36005004700.00	36005004700.00
116	36005019900.00	36005019900.00
117	36005019900.00	36005019900.00
158	36005006500.00	36005006500.00
180	36005025500.00	36005025500.00
192	36005023702.00	36005023702.00
193	36005023702.00	36005023702.00
194	36005023702.00	36005023702.00
195	36005023702.00	36005023702.00
209	36061030300.00	36061030300.00
210	36061030300.00	36061030300.00
234	36005005302.00	36005005302.00
235	36005005302.00	36005005302.00
...	...	...
48349	36047054700.00	36047054700.00
48350	36047054700.00	36047054700.00
48353	36047054700.00	36047054700.00
48365	36047052900.00	36047052900.00
48366	36047052900.00	36047052900.00
48382	36047055100.00	36047055100.00
48401	36047089400.00	36047089400.00
48402	36047089400.00	36047089400.00
48411	36047114201.00	36047114201.00
48492	36061000700.00	36061000700.00
48493	36061000700.00	36061000700.00
48528	36047042900.00	36047042900.00
48529	36047042900.00	36047042900.00
48539	36047042900.00	36047042900.00
48540	36047042900.00	36047042900.00
48615	34037373900.00	34037373900.00
48618	36061014200.00	36061014200.00
48619	36061014200.00	36061014200.00
48759	36061027100.00	36061027100.00
48796	36061024500.00	36061024500.00
48797	36061024500.00	36061024500.00
48803	36061027100.00	36061027100.00
48836	36061009800.00	36061009800.00
48837	36061009800.00	36061009800.00
48861	36061025100.00	36061025100.00
49013	36061000800.00	36061000800.00
49014	36061000800.00	36061000800.00
49140	36061026900.00	36061026900.00
49141	36061026900.00	36061026900.00
49149	36061027900.00	36061027900.00

3205 rows × 2 columns

3205 trips has same origin and destination which is 6% of the total respondents.

week_dif_OD = week[week['O_TRACT'] != week['D_TRACT']]

len(week_dif_OD)

# With OD same rows

reader = enumerate([(tract[:-3],1) for tract in week['O_TRACT'].astype(str)])
def mapper(_,o):
    yield(o)
def reducer(o,count):
    yield(o,sum(count))
o_tracts = list(mr.run(reader,mapper,reducer))

reader = enumerate([(tract[:-3],1) for tract in week['D_TRACT'].astype(str)])
def mapper(_,o):
    yield(o)
def reducer(o,count):
    yield(o,sum(count))
d_tracts = list(mr.run(reader,mapper,reducer))

# Without OD same rows

reader = enumerate([(tract[:-3],1) for tract in week_dif_OD['O_TRACT'].astype(str)])
def mapper(_,o):
    yield(o)
def reducer(o,count):
    yield(o,sum(count))
o_tracts = list(mr.run(reader,mapper,reducer))

reader = enumerate([(tract[:-3],1) for tract in week_dif_OD['D_TRACT'].astype(str)])
def mapper(_,o):
    yield(o)
def reducer(o,count):
    yield(o,sum(count))
d_tracts = list(mr.run(reader,mapper,reducer))

# with open('eggs.csv', 'w', newline='') as csvfile:
#     spamwriter = csv.writer(csvfile, delimiter=' ',
#                             quotechar='|', quoting=csv.QUOTE_MINIMAL)
#     for i in o_tracts:
#         spamwriter.writerow(i)

# with open('spam.csv', 'w', newline='') as csvfile:
#     spamwriter = csv.writer(csvfile, delimiter=' ',
#                             quotechar='|', quoting=csv.QUOTE_MINIMAL)
#     for i in d_tracts:
#         spamwriter.writerow(i)

trip_o = pd.DataFrame(o_tracts, columns=['tracts','trip_o'])
trip_d = pd.DataFrame(d_tracts, columns=['tracts','trip_d'])

display(trip_o.head(), trip_d.head())

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	tracts	trip_o
0	11001005900	2
1	11001008600	1
2	19061000100	1
3	25019950100	1
4	25019950400	1

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	tracts	trip_d
0		1
1	11001008600	1
2	25019950400	1
3	25027759100	4
4	34001002400	5

Census Tracts

Census Tract Code Format

census tract code
state + county + tract Counties in NYC
county code in NYC
State Code of NYC : 36
Manhattan - New York County (061)
Bronx - Bronx County (005)
Brooklyn - Kings County (047)
Queens - Queens County (081)
Staten Island - Richmond County (085)

ct2000 = pd.read_csv('nyct2000wi.csv', dtype=str)

ct2000.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	BoroCode	BoroName	CTLabel	the_geom	CT2000	BoroCT2000	CDEligibil	NTACode	NTANAme	PUMA	Shape_Leng	Shape_Area
0	1	Manhattan	98	MULTIPOLYGON (((-73.96432543478758 40.75638153...	009800	1009800	I	MN19	Turtle Bay-East Midtown	3808	5534.20005976	1906016.374
1	1	Manhattan	100	MULTIPOLYGON (((-73.96802436915851 40.75957814...	010000	1010000	I	MN19	Turtle Bay-East Midtown	3808	5692.16866575	1860938.35958
2	1	Manhattan	190	MULTIPOLYGON (((-73.94505127984516 40.80259859...	019000	1019000	E	MN11	Central Harlem South	3803	4231.8265884	1117371.65978
3	1	Manhattan	206	MULTIPOLYGON (((-73.93580780201182 40.80949763...	020600	1020600	E	MN03	Central Harlem North-Polo Grounds	3803	5176.87315098	1602693.82096
4	1	Manhattan	217.02	MULTIPOLYGON (((-73.9470345118215 40.815829904...	021702	1021702	E	MN03	Central Harlem North-Polo Grounds	3803	3338.290909	446993.881638

ct2000.BoroName.unique()

array(['Manhattan', 'Bronx', 'Queens', 'Brooklyn', 'Staten Island'],
      dtype=object)

ct2000['cdct'][ct2000['BoroName']=='Manhattan'] = '36061' + ct2000['CT2000']
ct2000['cdct'][ct2000['BoroName']=='Bronx'] = '36005' + ct2000['CT2000']
ct2000['cdct'][ct2000['BoroName']=='Brooklyn'] = '36047' + ct2000['CT2000']
ct2000['cdct'][ct2000['BoroName']=='Queens'] = '36081' + ct2000['CT2000']
ct2000['cdct'][ct2000['BoroName']=='Staten Island'] = '36085' + ct2000['CT2000']

ct2000[:2]

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	BoroCode	BoroName	CTLabel	the_geom	CT2000	BoroCT2000	CDEligibil	NTACode	NTANAme	PUMA	Shape_Leng	Shape_Area	cdct
0	1	Manhattan	98	MULTIPOLYGON (((-73.96432543478758 40.75638153...	009800	1009800	I	MN19	Turtle Bay-East Midtown	3808	5534.20005976	1906016.374	36061009800
1	1	Manhattan	100	MULTIPOLYGON (((-73.96802436915851 40.75957814...	010000	1010000	I	MN19	Turtle Bay-East Midtown	3808	5692.16866575	1860938.35958	36061010000

print(ct2000.cdct.dtype, trip_o.tracts.dtype, trip_d.tracts.dtype )
print(trip_o.trip_o.dtype, trip_d.trip_d.dtype)

object object object
int64 int64

mrg = pd.merge(ct2000, trip_o, how='inner', left_on='cdct', right_on='tracts')
mrg = pd.merge(mrg, trip_d, how='inner', left_on='cdct', right_on='tracts')

print(len(mrg))
display(mrg.head())

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	BoroCode	BoroName	CTLabel	the_geom	CT2000	BoroCT2000	CDEligibil	NTACode	NTANAme	PUMA	Shape_Leng	Shape_Area	cdct	tracts_x	trip_o	tracts_y	trip_d
0	1	Manhattan	98	MULTIPOLYGON (((-73.96432543478758 40.75638153...	009800	1009800	I	MN19	Turtle Bay-East Midtown	3808	5534.20005976	1906016.374	36061009800	36061009800	72	36061009800	74
1	1	Manhattan	100	MULTIPOLYGON (((-73.96802436915851 40.75957814...	010000	1010000	I	MN19	Turtle Bay-East Midtown	3808	5692.16866575	1860938.35958	36061010000	36061010000	145	36061010000	140
2	1	Manhattan	190	MULTIPOLYGON (((-73.94505127984516 40.80259859...	019000	1019000	E	MN11	Central Harlem South	3803	4231.8265884	1117371.65978	36061019000	36061019000	36	36061019000	33
3	1	Manhattan	206	MULTIPOLYGON (((-73.93580780201182 40.80949763...	020600	1020600	E	MN03	Central Harlem North-Polo Grounds	3803	5176.87315098	1602693.82096	36061020600	36061020600	9	36061020600	9
4	1	Manhattan	217.02	MULTIPOLYGON (((-73.9470345118215 40.815829904...	021702	1021702	E	MN03	Central Harlem North-Polo Grounds	3803	3338.290909	446993.881638	36061021702	36061021702	13	36061021702	13

mrg.to_csv('od_trip_ct2008.csv')

aggregating with population data

!wget -O pop_ct.csv https://data.cityofnewyork.us/api/views/kc6e-jm93/rows.csv?accessType=DOWNLOAD

--2019-02-27 08:48:28--  https://data.cityofnewyork.us/api/views/kc6e-jm93/rows.csv?accessType=DOWNLOAD
Resolving data.cityofnewyork.us (data.cityofnewyork.us)... 52.206.68.26, 52.206.140.199, 52.206.140.205
Connecting to data.cityofnewyork.us (data.cityofnewyork.us)|52.206.68.26|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/csv]
Saving to: ‘pop_ct.csv’

pop_ct.csv              [ <=>                ] 132.31K  --.-KB/s    in 0.002s  

Last-modified header invalid -- time-stamp ignored.
2019-02-27 08:48:29 (57.7 MB/s) - ‘pop_ct.csv’ saved [135483]

pop2000 = pd.read_csv('pop_ct.csv', dtype=str)
pop2000 = pop2000[pop2000['Year'] == '2000']

pop2000.Year.unique()

array(['2000'], dtype=object)

pop2000.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	Borough	Year	FIPS County Code	DCP Borough Code	Census Tract	Population
0	Bronx	2000	005	2	000100	12780
1	Bronx	2000	005	2	000200	3545
2	Bronx	2000	005	2	000400	3314
3	Bronx	2000	005	2	001600	5237
4	Bronx	2000	005	2	001900	1584

pop2000['cdct'] = '36' + pop2000['FIPS County Code'] + pop2000['Census Tract']

pop2000.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	Borough	Year	FIPS County Code	DCP Borough Code	Census Tract	Population	cdct
0	Bronx	2000	005	2	000100	12780	36005000100
1	Bronx	2000	005	2	000200	3545	36005000200
2	Bronx	2000	005	2	000400	3314	36005000400
3	Bronx	2000	005	2	001600	5237	36005001600
4	Bronx	2000	005	2	001900	1584	36005001900

mrg = pd.merge(mrg, pop2000, how='inner', on='cdct')

mrg.columns

Index(['BoroCode', 'BoroName', 'CTLabel', 'the_geom', 'CT2000', 'BoroCT2000',
       'CDEligibil', 'NTACode', 'NTANAme', 'PUMA', 'Shape_Leng', 'Shape_Area',
       'cdct', 'tracts_x', 'trip_o', 'tracts_y', 'trip_d', 'Borough', 'Year',
       'FIPS County Code', 'DCP Borough Code', 'Census Tract', 'Population'],
      dtype='object')

mrg = mrg[['BoroName', 'the_geom', 'cdct', 'trip_o', 'trip_d', 'Year', 'Population']]

mrg.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	BoroName	the_geom	cdct	trip_o	trip_d	Year	Population
0	Manhattan	MULTIPOLYGON (((-73.96432543478758 40.75638153...	36061009800	72	74	2000	7066
1	Manhattan	MULTIPOLYGON (((-73.96802436915851 40.75957814...	36061010000	145	140	2000	1822
2	Manhattan	MULTIPOLYGON (((-73.94505127984516 40.80259859...	36061019000	36	33	2000	1818
3	Manhattan	MULTIPOLYGON (((-73.93580780201182 40.80949763...	36061020600	9	9	2000	2310
4	Manhattan	MULTIPOLYGON (((-73.93669078722161 40.83719324...	36061024900	6	6	2000	1150

mrg['trip_o'] - mrg['trip_d']

0       -2
1        5
2        3
3        0
4        0
5        6
6       -6
7        0
8        1
9        3
10       1
11       3
12       2
13     -11
14       2
15      -1
16      -6
17       1
18       1
19      -1
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
        ..
1821     0
1822    -2
1823     1
1824    -1
1825     0
1826     0
1827    -7
1828     0
1829     1
1830     0
1831     0
1832     0
1833     0
1834    -5
1835    -1
1836     0
1837     0
1838     6
1839    -1
1840    -1
1841     1
1842     0
1843     0
1844     1
1845    -1
1846     0
1847     0
1848     0
1849     0
1850     0
Length: 1851, dtype: int64

mrg.to_csv('trip_with_pop.csv', index=False)

Figure 0. Origin Data

Figure 0. Destination data (Quantile)

sucream1004/capstone2019

Trip data

Census Tracts

Census Tract Code Format

aggregating with population data