Dask is a free and open-source library for parallel computing in Python.
More specifically, we can use Dask to manipulate 100GB+ datasets on a single laptop or 1TB+ datasets on a workstation, and it can be easily scaled to thousand-node clusters for very large datasets.
There are several convenient ways to install Dask and all its common dependencies.
If you are a conda user (Anaconda or Miniconda), you can install Dask from either the default channel or conda-forge.
conda install dask
Or
conda install dask -c conda-forge
If you use standard Python instead of conda, use pip to install Dask and all its common dependencies.
pip install "dask[complete]"
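To confirm that the installation works, you can print the installed version; this is just a quick sanity check, and any recent Dask version is fine for the examples below.
import dask
import dask.dataframe as dd  # raises ImportError if the DataFrame dependencies are missing

print(dask.__version__)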
Dask has three types of data structures: Dask Arrays, Dask Bags, and Dask DataFrames. We will discuss Dask DataFrames in this article because of their popularity and ease of use.
A Dask DataFrame is a collection of many pandas DataFrames; it operates similarly to a pandas DataFrame and supports a large subset of the pandas API. Unlike a pandas DataFrame, however, a Dask DataFrame can not only compute on big data using all the available cores of a single machine, but can also be stored and processed in parallel across distributed remote machines.
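To make the idea of "a collection of pandas DataFrames" concrete, here is a minimal sketch; the toy data and the partition count are chosen purely for illustration.
import pandas as pd
import dask.dataframe as dd

# A small pandas DataFrame split into 2 partitions;
# each partition is itself an ordinary pandas DataFrame.
pdf = pd.DataFrame({'x': range(10), 'y': range(10, 20)})
ddf = dd.from_pandas(pdf, npartitions=2)

print(ddf.npartitions)                    # 2
print(type(ddf.partitions[0].compute()))  # <class 'pandas.core.frame.DataFrame'>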
We can easily create a Dask DataFrame from various data storage formats such as CSV, HDF, Apache Parquet, SQL, JSON, and more.
4.1 Read .csv Files
Similar to pandas, we can read one or more .csv files using read_csv with Dask DataFrames. To practice the examples in this section, please download the dataset from Kaggle. If you are interested in how to download datasets from Kaggle, please read this article or another article.
(1) Read a single .csv file
Let's see how to read a single .csv file. First, we need to import Dask DataFrame. Just as we write import pandas as pd to use a pandas DataFrame, we simply change it to import dask.dataframe as dd to use a Dask DataFrame.
# import pandas as pd
import dask.dataframe as dd
df = dd.read_csv('./data/yellow_tripdata_2019-01.csv')
Just like Pandas, we can display the first five rows as follows:
df.head()
| | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2019-01-01 00:46:40 | 2019-01-01 00:53:20 | 1 | 1.5 | 1 | N | 151 | 239 | 1 | 7.0 | 0.5 | 0.5 | 1.65 | 0.0 | 0.3 | 9.95 | NaN |
1 | 1 | 2019-01-01 00:59:47 | 2019-01-01 01:18:59 | 1 | 2.6 | 1 | N | 239 | 246 | 1 | 14.0 | 0.5 | 0.5 | 1.00 | 0.0 | 0.3 | 16.30 | NaN |
2 | 2 | 2018-12-21 13:48:30 | 2018-12-21 13:52:40 | 3 | 0.0 | 1 | N | 236 | 236 | 1 | 4.5 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 5.80 | NaN |
3 | 2 | 2018-11-28 15:52:25 | 2018-11-28 15:55:45 | 5 | 0.0 | 1 | N | 193 | 193 | 2 | 3.5 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 7.55 | NaN |
4 | 2 | 2018-11-28 15:56:57 | 2018-11-28 15:58:33 | 5 | 0.0 | 2 | N | 193 | 193 | 2 | 52.0 | 0.0 | 0.5 | 0.00 | 0.0 | 0.3 | 55.55 | NaN |
From the above output, we can see that the Dask DataFrame looks no different from a pandas DataFrame.
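One difference worth noting, although it is invisible in the output above, is that most Dask DataFrame operations are lazy: they build a task graph instead of computing immediately. A small sketch, assuming the df loaded above:
# head() computes eagerly on the first partition, but most operations are deferred.
mean_distance = df.trip_distance.mean()  # lazy: nothing is read yet
print(mean_distance)            # prints a Dask scalar placeholder, not a number
print(mean_distance.compute())  # triggers the actual computation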
(2) Read multiple .csv files
In the last article, we already experimented with the data loading speed of three .csv files. The results showed that it took pandas 48.634 seconds to read the three files, while Modin with Dask took 24.772 seconds and Modin with Ray took 15.785 seconds to load the same datasets.
In the following example, let's compare Dask's loading performance on the same files. Moreover, unlike pandas and Modin, Dask DataFrame provides a much easier way to read multiple data files: simply create a list of the file paths and pass the list to read_csv.
Here, we use the %%time magic in Jupyter Notebook or JupyterLab to measure the data loading speed. If you use another Python IDE, you need to use the Python time module as we did in the last article.
%%time
data_list = ['./yellow_taxi_data/yellow_tripdata_2019-01.csv',
'./yellow_taxi_data/yellow_tripdata_2019-02.csv',
'./yellow_taxi_data/yellow_tripdata_2019-03.csv']
ddf1 = dd.read_csv(data_list)
Wall time: 23.8 ms
Or
ddf2 = dd.read_csv(['./yellow_taxi_data/yellow_tripdata_2019-01.csv',
'./yellow_taxi_data/yellow_tripdata_2019-02.csv',
'./yellow_taxi_data/yellow_tripdata_2019-03.csv'])
From the above result, Dask appears extremely fast: it takes only 23.8 ms to open the same files that took pandas 48.634 seconds to load on my computer. To be fair, this is largely because Dask evaluates lazily: read_csv only builds a task graph and infers column metadata from a small sample, and the full data are not loaded into memory until a computation is triggered. Besides, Dask offers a much easier method to read multiple files.
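To make the comparison fairer, we can also time an actual computation, since that is when Dask really reads the data. A sketch, assuming the ddf1 created above; your timings will differ:
%%time
# len() forces Dask to read every partition, so this measures a real full pass.
n_rows = len(ddf1)
print(n_rows)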
In this example, we read all the *.csv files in the yellow_taxi_data folder.
df_full = dd.read_csv('./yellow_taxi_data/*.csv')
There are different types of files in the data folder of my current working directory (CWD). For example, we read 18 .csv dataset files with a total size of 8.71 GB, from yellow_tripdata_2019-01 to yellow_tripdata_2020-06. Let's read them with Dask DataFrame and see the loading speed.
%%time
ddf3 = dd.read_csv(['./data/yellow_tripdata_2019-*.csv',
'./data/yellow_tripdata_2020-*.csv'])
Wall time: 19 ms
It is unbelievably fast: Dask needs only 19 ms to open this much larger dataset (again because the actual reading is deferred until a computation runs). In contrast, my computer froze when I used pandas to read these files, and when I tried Modin, it showed many warnings and OSError: [Errno 28] No space left on device.
We can easily break up a single large file with the blocksize parameter. For example, we break the 2.03 GB file green_tripdata_2019-01-03m.csv into 25 MB chunks. This dataset is the combined version of the three csv files used in Section 4.1 (2); the easiest way to obtain it is to first save those files to a single csv file using the method in Section 5.1 below.
The next question is what chunk size we should use. According to the Dask documentation, a good rule of thumb is to keep the partitions under 100 MB in size.
df_block = dd.read_csv('./data/green_tripdata_2019-01-03m.csv', blocksize=25e6)
df_block.head()
| | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2019-01-01 00:46:40 | 2019-01-01 00:53:20 | 1 | 1.5 | 1 | N | 151 | 239 | 1 | 7.0 | 0.5 | 0.5 | 1.65 | 0.0 | 0.3 | 9.95 | NaN |
1 | 1 | 2019-01-01 00:59:47 | 2019-01-01 01:18:59 | 1 | 2.6 | 1 | N | 239 | 246 | 1 | 14.0 | 0.5 | 0.5 | 1.00 | 0.0 | 0.3 | 16.30 | NaN |
2 | 2 | 2018-12-21 13:48:30 | 2018-12-21 13:52:40 | 3 | 0.0 | 1 | N | 236 | 236 | 1 | 4.5 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 5.80 | NaN |
3 | 2 | 2018-11-28 15:52:25 | 2018-11-28 15:55:45 | 5 | 0.0 | 1 | N | 193 | 193 | 2 | 3.5 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 7.55 | NaN |
4 | 2 | 2018-11-28 15:56:57 | 2018-11-28 15:58:33 | 5 | 0.0 | 2 | N | 193 | 193 | 2 | 52.0 | 0.0 | 0.5 | 0.00 | 0.0 | 0.3 | 55.55 | NaN |
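We can verify the chunking by checking the number of partitions; a 2.03 GB file split into 25 MB blocks should yield roughly 2030 MB / 25 MB ≈ 82 partitions, although the exact count depends on the file.
# Each 25 MB block becomes one partition of the Dask DataFrame.
print(df_block.npartitions)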
(3) Read csv files online
Similar to pandas, Dask can easily read csv files online, for example from one of my GitHub repositories.
url = 'https://raw.githubusercontent.com/shoukewei/data/main/data-1dwt/930-data-export.csv'
ddf4 = dd.read_csv(url)
ddf4.head()
| | Region Code | Timestamp (Hour Ending) | CAL Demand (MWh) | CAR Demand (MWh) | CENT Demand (MWh) | FLA Demand (MWh) | MIDA Demand (MWh) | MIDW Demand (MWh) | NW Demand (MWh) | SE Demand (MWh) | SW Demand (MWh) |
|---|---|---|---|---|---|---|---|---|---|---|---|
0 | US48 | 8/9/2022 12 a.m. EDT | 47119 | 29809 | 38993 | 31602 | 112854 | 91266 | 52342 | 29482 | 18988 |
1 | US48 | 8/9/2022 1 a.m. EDT | 45732 | 27658 | 36487 | 29235 | 107920 | 86230 | 49108 | 27310 | 17922 |
2 | US48 | 8/9/2022 2 a.m. EDT | 43457 | 26187 | 34575 | 27383 | 103539 | 82753 | 45233 | 26329 | 16545 |
3 | US48 | 8/9/2022 3 a.m. EDT | 40527 | 25172 | 33135 | 26180 | 99702 | 79285 | 41682 | 25426 | 15300 |
4 | US48 | 8/9/2022 4 a.m. EDT | 37771 | 24397 | 32155 | 25423 | 97137 | 77026 | 39198 | 24759 | 14397 |
Apache Parquet is an open source, column-oriented data file format, which was designed for efficient data compression and encoding schemes with enhanced performance to handle complex data in bulk.
Similar to reading .csv files, we can easily read one or more Parquet files using read_parquet(). You can download one or more Parquet data files from the TLC DATA HUB to follow the examples in this section.
ddf_p = dd.read_parquet('./data/green_tripdata_2022-11.parquet')
ddf_p.head()
| | VendorID | lpep_pickup_datetime | lpep_dropoff_datetime | store_and_fwd_flag | RatecodeID | PULocationID | DOLocationID | passenger_count | trip_distance | fare_amount | extra | mta_tax | tip_amount | tolls_amount | ehail_fee | improvement_surcharge | total_amount | payment_type | trip_type | congestion_surcharge |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 2022-11-01 00:28:37 | 2022-11-01 00:31:56 | N | 1.0 | 223 | 223 | 1.0 | 0.71 | 4.5 | 0.5 | 0.5 | 1.45 | 0.0 | None | 0.3 | 7.25 | 1.0 | 1.0 | 0.00 |
1 | 2 | 2022-11-01 00:51:02 | 2022-11-01 01:12:50 | N | 5.0 | 80 | 90 | 2.0 | 6.86 | 45.0 | 0.0 | 0.0 | 9.61 | 0.0 | None | 0.3 | 57.66 | 1.0 | 2.0 | 2.75 |
2 | 2 | 2022-11-01 00:51:50 | 2022-11-01 00:55:38 | N | 1.0 | 244 | 244 | 2.0 | 0.58 | 4.5 | 0.5 | 0.5 | 0.00 | 0.0 | None | 0.3 | 5.80 | 2.0 | 1.0 | 0.00 |
3 | 2 | 2022-11-01 00:03:32 | 2022-11-01 00:12:28 | N | 1.0 | 116 | 74 | 1.0 | 2.74 | 10.5 | 0.5 | 0.5 | 0.00 | 0.0 | None | 0.3 | 11.80 | 2.0 | 1.0 | 0.00 |
4 | 2 | 2022-11-01 00:17:46 | 2022-11-01 00:22:03 | N | 1.0 | 134 | 134 | 1.0 | 0.91 | 5.0 | 0.5 | 0.5 | 1.58 | 0.0 | None | 0.3 | 7.88 | 1.0 | 1.0 | 0.00 |
For a large dataset, we can read only a few columns to avoid loading unnecessary data. For example, here we read only VendorID and trip_distance.
ddf_p2 = dd.read_parquet('./data/green_tripdata_2022-11.parquet',
columns=['VendorID', 'trip_distance'])
ddf_p2.head()
| | VendorID | trip_distance |
|---|---|---|
0 | 2 | 0.71 |
1 | 2 | 6.86 |
2 | 2 | 0.58 |
3 | 2 | 2.74 |
4 | 2 | 0.91 |
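Because Parquet is column-oriented and stores statistics per row group, we can also push simple row filters into the read itself using the filters argument. A sketch; the threshold below is arbitrary and only for illustration:
# Row groups that cannot contain matching rows are skipped entirely.
ddf_p3 = dd.read_parquet('./data/green_tripdata_2022-11.parquet',
                         columns=['VendorID', 'trip_distance'],
                         filters=[('trip_distance', '>', 10)])
ddf_p3.head()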
Hierarchical Data Format (HDF) is a set of file formats (HDF4, HDF5) designed by the National Center for Supercomputing Applications (NCSA), which is used to store and organize large amounts of data.
To read HDF files into a Dask DataFrame, we simply use read_hdf. If you do not have an HDF file, you can follow Section 5.3 below to create one first and then come back to practice this section.
ddf_c = dd.read_hdf('./data/yellow_tripdata_2019-01.hdf5','/hdfdata')
ddf_c.head()
| | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2019-01-01 00:46:40 | 2019-01-01 00:53:20 | 1 | 1.5 | 1 | N | 151 | 239 | 1 | 7.0 | 0.5 | 0.5 | 1.65 | 0.0 | 0.3 | 9.95 | NaN |
1 | 1 | 2019-01-01 00:59:47 | 2019-01-01 01:18:59 | 1 | 2.6 | 1 | N | 239 | 246 | 1 | 14.0 | 0.5 | 0.5 | 1.00 | 0.0 | 0.3 | 16.30 | NaN |
2 | 2 | 2018-12-21 13:48:30 | 2018-12-21 13:52:40 | 3 | 0.0 | 1 | N | 236 | 236 | 1 | 4.5 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 5.80 | NaN |
3 | 2 | 2018-11-28 15:52:25 | 2018-11-28 15:55:45 | 5 | 0.0 | 1 | N | 193 | 193 | 2 | 3.5 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 7.55 | NaN |
4 | 2 | 2018-11-28 15:56:57 | 2018-11-28 15:58:33 | 5 | 0.0 | 2 | N | 193 | 193 | 2 | 52.0 | 0.0 | 0.5 | 0.00 | 0.0 | 0.3 | 55.55 | NaN |
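read_hdf also accepts glob patterns, both in the file path and in the key, so several files or several stored objects can be combined into one Dask DataFrame. A sketch with hypothetical paths and keys:
# All files matching the path glob, and all objects matching '/hdfdata*',
# are concatenated into a single Dask DataFrame.
ddf_many = dd.read_hdf('./data/*.hdf5', '/hdfdata*')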
If you are interested in the other data formats that Dask DataFrames can read from, please refer to Create and Store Dask DataFrames in the Dask documentation.
Conversely, we can easily save/store a Dask DataFrame in different data formats, such as CSV, Parquet, and HDF. Let's look at a few examples.
For example, let's save the DataFrame that we read from the three files in Section 4.1 (2) above to CSV files. Similar to pandas, Dask uses to_csv.
ddf2.to_csv('./data/green_tripdata_2019-01m-03m.csv',index=False)
['D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\00.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\01.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\02.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\03.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\04.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\05.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\06.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\07.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\08.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\09.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\10.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\11.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\12.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\13.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\14.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\15.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\16.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\17.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\18.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\19.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\20.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\21.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\22.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\23.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\24.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\25.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\26.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\27.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\28.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\29.part', 'D:\\big_data\\data\\green_tripdata_2019-01m-03m.csv\\30.part']
Unlike pandas, however, Dask automatically creates a folder named after the given file name and saves the data as several partitioned files inside that folder. In this example, it divides the CSV data into 31 parts and saves them in the folder green_tripdata_2019-01m-03m.csv in the CWD.
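By default the part files are numbered automatically. If you prefer your own naming scheme, to_csv accepts a name_function that maps each partition index to the string substituted for the * in the path; the names below are purely illustrative.
# Writes export-0.csv, export-1.csv, ... one file per partition.
ddf2.to_csv('./data/export-*.csv', index=False,
            name_function=lambda i: str(i))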
If you want to save the data as one single file, specify the argument single_file=True. You should first delete the files and folder saved above; if you want to keep them, you have to use a new file name for this single file, say green_tripdata_2019-01-03m.csv as in the following example.
ddf2.to_csv('./data/green_tripdata_2019-01-03m.csv',index=False,single_file=True)
['D:\\big_data\\data\\green_tripdata_2019-01-03m.csv']
Similarly, we can easily save a DataFrame to one or more Parquet files using to_parquet.
ddf2.to_parquet('./data/yellow_tripdata_2019-01.parquet')
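to_parquet likewise writes one file per partition into a folder named after the given path. You can optionally choose the engine and compression codec; the choices below are common options rather than requirements.
# Snappy-compressed Parquet part files written with the pyarrow engine.
ddf2.to_parquet('./data/yellow_tripdata_2019-01.parquet',
                engine='pyarrow', compression='snappy')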
This example shows how to conveniently save a DataFrame to HDF files. Similar to pandas, a key parameter has to be specified for the HDF file. For both the pandas and Dask to_hdf methods, the key parameter is the name of the object stored in the HDF file: a single HDF file can store multiple objects (DataFrames), and each object needs a name.
ddf2.to_hdf('./data/yellow_tripdata_2019-01.hdf5','/hdfdata')
['./data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5', './data/yellow_tripdata_2019-01.hdf5']
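The repeated path in the output shows that all 31 partitions were appended to the same HDF5 file under the given key. To write one file per partition instead, include a * in the path, which Dask replaces with the partition number; the file names here are illustrative.
# Produces yellow_tripdata_2019-01-0.hdf5, yellow_tripdata_2019-01-1.hdf5, ...
ddf2.to_hdf('./data/yellow_tripdata_2019-01-*.hdf5', '/hdfdata')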
If you are interested in other data formats for storing Dask DataFrames, please refer to Create and Store Dask DataFrames in the Dask documentation.
This article gave a general introduction to Dask and used specific examples to show how to use Dask DataFrame, in place of pandas, to read and store data when datasets become large. The comparison of reading CSV files confirms that Dask opens, in milliseconds, large datasets that pandas and Modin struggle to handle, with the heavy lifting deferred until computation. Furthermore, the examples show that Dask's API is very similar to, and as easy to use as, the pandas API. In many cases, Dask offers simpler methods than pandas, for example for reading the multiple files used in this article.