First, make sure you have Vaex installed. To check the installed version in a Jupyter Notebook, you can simply run the following code snippet:
import vaex
vaex.__version__
{'vaex': '4.16.0', 'vaex-core': '4.16.1', 'vaex-viz': '0.5.4', 'vaex-hdf5': '0.14.1', 'vaex-server': '0.8.1', 'vaex-astro': '0.9.3', 'vaex-jupyter': '0.8.1', 'vaex-ml': '0.18.1'}
The output above shows that Vaex and its subpackages are installed. Otherwise, you can install it by running the following command in your terminal (or prefix it with ! to run it directly in a Jupyter notebook cell):
pip install vaex
Vaex supports a variety of data formats, including CSV, Parquet, HDF5, FITS, Arrow, and more. Here's how to read data using Vaex:
To read a CSV file with Vaex, you can use the vaex.open(), vaex.from_csv(), or vaex.read_csv() methods. In this example, we use 'yellow_tripdata_2019-01.csv', which can be downloaded from Kaggle.
vaex.open()
import vaex
# read csv with vaex.open()
vdf = vaex.open('./data/yellow_tripdata_2019-01.csv')
vdf.head()
# | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2019-01-01 00:46:40 | 2019-01-01 00:53:20 | 1 | 1.5 | 1 | N | 151 | 239 | 1 | 7 | 0.5 | 0.5 | 1.65 | 0 | 0.3 | 9.95 | -- |
1 | 1 | 2019-01-01 00:59:47 | 2019-01-01 01:18:59 | 1 | 2.6 | 1 | N | 239 | 246 | 1 | 14 | 0.5 | 0.5 | 1 | 0 | 0.3 | 16.3 | -- |
2 | 2 | 2018-12-21 13:48:30 | 2018-12-21 13:52:40 | 3 | 0 | 1 | N | 236 | 236 | 1 | 4.5 | 0.5 | 0.5 | 0 | 0 | 0.3 | 5.8 | -- |
3 | 2 | 2018-11-28 15:52:25 | 2018-11-28 15:55:45 | 5 | 0 | 1 | N | 193 | 193 | 2 | 3.5 | 0.5 | 0.5 | 0 | 0 | 0.3 | 7.55 | -- |
4 | 2 | 2018-11-28 15:56:57 | 2018-11-28 15:58:33 | 5 | 0 | 2 | N | 193 | 193 | 2 | 52 | 0 | 0.5 | 0 | 0 | 0.3 | 55.55 | -- |
5 | 2 | 2018-11-28 16:25:49 | 2018-11-28 16:28:26 | 5 | 0 | 1 | N | 193 | 193 | 2 | 3.5 | 0.5 | 0.5 | 0 | 5.76 | 0.3 | 13.31 | -- |
6 | 2 | 2018-11-28 16:29:37 | 2018-11-28 16:33:43 | 5 | 0 | 2 | N | 193 | 193 | 2 | 52 | 0 | 0.5 | 0 | 0 | 0.3 | 55.55 | -- |
7 | 1 | 2019-01-01 00:21:28 | 2019-01-01 00:28:37 | 1 | 1.3 | 1 | N | 163 | 229 | 1 | 6.5 | 0.5 | 0.5 | 1.25 | 0 | 0.3 | 9.05 | -- |
8 | 1 | 2019-01-01 00:32:01 | 2019-01-01 00:45:39 | 1 | 3.7 | 1 | N | 229 | 7 | 1 | 13.5 | 0.5 | 0.5 | 3.7 | 0 | 0.3 | 18.5 | -- |
9 | 1 | 2019-01-01 00:57:32 | 2019-01-01 01:09:32 | 2 | 2.1 | 1 | N | 141 | 234 | 1 | 10 | 0.5 | 0.5 | 1.7 | 0 | 0.3 | 13 | -- |
Different from Pandas, df.head() in Vaex displays the first 10 rows by default.
vaex.from_csv()
vdf2 = vaex.from_csv('./data/yellow_tripdata_2019-01.csv')
vdf2.head()
# | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2019-01-01 00:46:40 | 2019-01-01 00:53:20 | 1 | 1.5 | 1 | N | 151 | 239 | 1 | 7 | 0.5 | 0.5 | 1.65 | 0 | 0.3 | 9.95 | nan |
1 | 1 | 2019-01-01 00:59:47 | 2019-01-01 01:18:59 | 1 | 2.6 | 1 | N | 239 | 246 | 1 | 14 | 0.5 | 0.5 | 1 | 0 | 0.3 | 16.3 | nan |
2 | 2 | 2018-12-21 13:48:30 | 2018-12-21 13:52:40 | 3 | 0 | 1 | N | 236 | 236 | 1 | 4.5 | 0.5 | 0.5 | 0 | 0 | 0.3 | 5.8 | nan |
3 | 2 | 2018-11-28 15:52:25 | 2018-11-28 15:55:45 | 5 | 0 | 1 | N | 193 | 193 | 2 | 3.5 | 0.5 | 0.5 | 0 | 0 | 0.3 | 7.55 | nan |
4 | 2 | 2018-11-28 15:56:57 | 2018-11-28 15:58:33 | 5 | 0 | 2 | N | 193 | 193 | 2 | 52 | 0 | 0.5 | 0 | 0 | 0.3 | 55.55 | nan |
5 | 2 | 2018-11-28 16:25:49 | 2018-11-28 16:28:26 | 5 | 0 | 1 | N | 193 | 193 | 2 | 3.5 | 0.5 | 0.5 | 0 | 5.76 | 0.3 | 13.31 | nan |
6 | 2 | 2018-11-28 16:29:37 | 2018-11-28 16:33:43 | 5 | 0 | 2 | N | 193 | 193 | 2 | 52 | 0 | 0.5 | 0 | 0 | 0.3 | 55.55 | nan |
7 | 1 | 2019-01-01 00:21:28 | 2019-01-01 00:28:37 | 1 | 1.3 | 1 | N | 163 | 229 | 1 | 6.5 | 0.5 | 0.5 | 1.25 | 0 | 0.3 | 9.05 | nan |
8 | 1 | 2019-01-01 00:32:01 | 2019-01-01 00:45:39 | 1 | 3.7 | 1 | N | 229 | 7 | 1 | 13.5 | 0.5 | 0.5 | 3.7 | 0 | 0.3 | 18.5 | nan |
9 | 1 | 2019-01-01 00:57:32 | 2019-01-01 01:09:32 | 2 | 2.1 | 1 | N | 141 | 234 | 1 | 10 | 0.5 | 0.5 | 1.7 | 0 | 0.3 | 13 | nan |
vaex.read_csv()
You can use read_csv() alternatively if you prefer the same API as pandas.
vdf3 = vaex.read_csv('./data/yellow_tripdata_2019-01.csv')
# display the first 5 rows
vdf3.head(5)
# | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2019-01-01 00:46:40 | 2019-01-01 00:53:20 | 1 | 1.5 | 1 | N | 151 | 239 | 1 | 7 | 0.5 | 0.5 | 1.65 | 0 | 0.3 | 9.95 | nan |
1 | 1 | 2019-01-01 00:59:47 | 2019-01-01 01:18:59 | 1 | 2.6 | 1 | N | 239 | 246 | 1 | 14 | 0.5 | 0.5 | 1 | 0 | 0.3 | 16.3 | nan |
2 | 2 | 2018-12-21 13:48:30 | 2018-12-21 13:52:40 | 3 | 0 | 1 | N | 236 | 236 | 1 | 4.5 | 0.5 | 0.5 | 0 | 0 | 0.3 | 5.8 | nan |
3 | 2 | 2018-11-28 15:52:25 | 2018-11-28 15:55:45 | 5 | 0 | 1 | N | 193 | 193 | 2 | 3.5 | 0.5 | 0.5 | 0 | 0 | 0.3 | 7.55 | nan |
4 | 2 | 2018-11-28 15:56:57 | 2018-11-28 15:58:33 | 5 | 0 | 2 | N | 193 | 193 | 2 | 52 | 0 | 0.5 | 0 | 0 | 0.3 | 55.55 | nan |
vaex.open() reads data lazily rather than keeping it all in RAM, whereas vaex.from_csv() and vaex.read_csv() use Pandas in the background to read the full dataset into memory, which can be more practical for smaller datasets.
Reading CSV files via Pandas can be slow. Apache Arrow provides a considerably faster way of reading such files, and Vaex conveniently exposes this functionality:
df_by_arrow = vaex.from_csv_arrow('./data/yellow_tripdata_2019-01.csv')
# display the first 3 rows
df_by_arrow.head(3)
# | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2019-01-01 00:46:40 | 2019-01-01 00:53:20 | 1 | 1.5 | 1 | N | 151 | 239 | 1 | 7 | 0.5 | 0.5 | 1.65 | 0 | 0.3 | 9.95 | -- |
1 | 1 | 2019-01-01 00:59:47 | 2019-01-01 01:18:59 | 1 | 2.6 | 1 | N | 239 | 246 | 1 | 14 | 0.5 | 0.5 | 1 | 0 | 0.3 | 16.3 | -- |
2 | 2 | 2018-12-21 13:48:30 | 2018-12-21 13:52:40 | 3 | 0 | 1 | N | 236 | 236 | 1 | 4.5 | 0.5 | 0.5 | 0 | 0 | 0.3 | 5.8 | -- |
If the CSV file is too large to fit into RAM all at once, we can convert the data to HDF5.
df_chunk = vaex.from_csv('./data/yellow_tripdata_2019-01.csv', convert=True, chunk_size=5_000_000)
df_chunk.head(3)
# | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2019-01-01 00:46:40 | 2019-01-01 00:53:20 | 1 | 1.5 | 1 | N | 151 | 239 | 1 | 7 | 0.5 | 0.5 | 1.65 | 0 | 0.3 | 9.95 | nan |
1 | 1 | 2019-01-01 00:59:47 | 2019-01-01 01:18:59 | 1 | 2.6 | 1 | N | 239 | 246 | 1 | 14 | 0.5 | 0.5 | 1 | 0 | 0.3 | 16.3 | nan |
2 | 2 | 2018-12-21 13:48:30 | 2018-12-21 13:52:40 | 3 | 0 | 1 | N | 236 | 236 | 1 | 4.5 | 0.5 | 0.5 | 0 | 0 | 0.3 | 5.8 | nan |
When reading the CSV in chunks, Vaex converts each chunk to a temporary HDF5 file on disk. After all temporary files are concatenated into a single HDF5 file, the temporary files are deleted. The chunk_size argument specifies the number of rows in each chunk.
Similar to reading CSV, we can read a JSON file using vaex.from_json, which has the same arguments and file-reading strategy as pandas.read_json. We use 'population_data.json' from GitHub; you can download it and read it locally, or, as with pandas, read it directly from the raw data link.
url = 'https://raw.githubusercontent.com/Malekai/Downloading-Data/master/population_data.json'
df_json = vaex.from_json(url)
df_json.head(3)
# | Country Name | Country Code | Year | Value |
---|---|---|---|---|
0 | Arab World | ARB | 1960 | 9.63881e+07 |
1 | Arab World | ARB | 1961 | 9.88825e+07 |
2 | Arab World | ARB | 1962 | 1.01474e+08 |
When the data is spread across multiple JSON files, we can use an approach similar to the one for large or multiple CSV files: first read each JSON file with the vaex.from_json method and convert it to either HDF5 or Arrow format, then use vaex.open to read the data lazily.
# Read a Parquet file with Vaex
df_parq = vaex.open('./data/yellow_tripdata_2022-01.parquet')
df_parq.head(5)
# | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | airport_fee |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2022-01-01 00:35:40 | 2022-01-01 00:53:29 | 2 | 3.8 | 1 | N | 142 | 236 | 1 | 14.5 | 3 | 0.5 | 3.65 | 0 | 0.3 | 21.95 | 2.5 | 0 |
1 | 1 | 2022-01-01 00:33:43 | 2022-01-01 00:42:07 | 1 | 2.1 | 1 | N | 236 | 42 | 1 | 8 | 0.5 | 0.5 | 4 | 0 | 0.3 | 13.3 | 0 | 0 |
2 | 2 | 2022-01-01 00:53:21 | 2022-01-01 01:02:19 | 1 | 0.97 | 1 | N | 166 | 166 | 1 | 7.5 | 0.5 | 0.5 | 1.76 | 0 | 0.3 | 10.56 | 0 | 0 |
3 | 2 | 2022-01-01 00:25:21 | 2022-01-01 00:35:23 | 1 | 1.09 | 1 | N | 114 | 68 | 2 | 8 | 0.5 | 0.5 | 0 | 0 | 0.3 | 11.8 | 2.5 | 0 |
4 | 2 | 2022-01-01 00:36:48 | 2022-01-01 01:14:20 | 1 | 4.3 | 1 | N | 68 | 163 | 1 | 23.5 | 0.5 | 0.5 | 3 | 0 | 0.3 | 30.3 | 2.5 | 0 |
# Read an HDF5 file with Vaex
df_hdf = vaex.open('./data/sample_names_1.hdf5')
df_hdf
# | name | age | city |
---|---|---|---|
0 | John | 17 | Edinburgh |
1 | Sally | 33 | Groningen |
Vaex supports streaming HDF5, Apache Arrow, Apache Parquet, and CSV files from Amazon's S3 and Google Cloud Storage. Here we follow an example from its documentation that streams an HDF5 file directly from S3.
df_s3 = vaex.open('s3://vaex/taxi/nyc_taxi_2015_mini.hdf5?anon=true')
df_s3.head(3)
# | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | payment_type | trip_distance | pickup_longitude | pickup_latitude | rate_code | store_and_fwd_flag | dropoff_longitude | dropoff_latitude | fare_amount | surcharge | mta_tax | tip_amount | tolls_amount | total_amount |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | VTS | 2015-02-27 22:11:38.000000000 | 2015-02-27 22:22:51.000000000 | 5 | 1 | 2.26 | -74.0066 | 40.7075 | 1 | 0 | -74.0096 | 40.7346 | 10 | 0.5 | 0.5 | 2 | 0 | 13.3 |
1 | VTS | 2015-08-04 00:36:01.000000000 | 2015-08-04 00:47:11.000000000 | 1 | 1 | 5.13 | -74.0075 | 40.7052 | 1 | 0 | -73.9673 | 40.7552 | 16 | 0.5 | 0.5 | 3.46 | 0 | 20.76 |
2 | VTS | 2015-01-28 19:56:52.000000000 | 2015-01-28 20:03:27.000000000 | 1 | 2 | 1.89 | -73.9719 | 40.7629 | 1 | 0 | -73.9551 | 40.786 | 7.5 | 1 | 0.5 | 0 | 0 | 9.3 |
If a dataset is composed of multiple files, we can easily open them with Vaex. You can download multiple files, such as CSV, Parquet, or HDF5, from the links provided above. In this example, we will use the first three months of 'Yellow Taxi Trip Records (PARQUET)' in 2022.
If you want to open all files simultaneously, you can easily do as follows:
df_all = vaex.open('./data/*.parquet')
df_all.head(5)
# | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | airport_fee |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2019-01-01 00:46:40 | 2019-01-01 00:53:20 | 1 | 1.5 | 1 | N | 151 | 239 | 1 | 7 | 0.5 | 0.5 | 1.65 | 0 | 0.3 | 9.95 | -- | -- |
1 | 1 | 2019-01-01 00:59:47 | 2019-01-01 01:18:59 | 1 | 2.6 | 1 | N | 239 | 246 | 1 | 14 | 0.5 | 0.5 | 1 | 0 | 0.3 | 16.3 | -- | -- |
2 | 2 | 2018-12-21 13:48:30 | 2018-12-21 13:52:40 | 3 | 0 | 1 | N | 236 | 236 | 1 | 4.5 | 0.5 | 0.5 | 0 | 0 | 0.3 | 5.8 | -- | -- |
3 | 2 | 2018-11-28 15:52:25 | 2018-11-28 15:55:45 | 5 | 0 | 1 | N | 193 | 193 | 2 | 3.5 | 0.5 | 0.5 | 0 | 0 | 0.3 | 7.55 | -- | -- |
4 | 2 | 2018-11-28 15:56:57 | 2018-11-28 15:58:33 | 5 | 0 | 2 | N | 193 | 193 | 2 | 52 | 0 | 0.5 | 0 | 0 | 0.3 | 55.55 | -- | -- |
Let's check its shape.
df_all.shape
(31590956, 19)
The dataset has 31,590,956 rows.
An alternative is the open_many method, which takes a list of the files you want to open. For example, here we open just two files.
file_list = ['./data/yellow_tripdata_2022-01.parquet',
'./data/yellow_tripdata_2022-02.parquet']
df_list = vaex.open_many(file_list)
df_list.head(5)
# | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | airport_fee |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2022-01-01 00:35:40 | 2022-01-01 00:53:29 | 2 | 3.8 | 1 | N | 142 | 236 | 1 | 14.5 | 3 | 0.5 | 3.65 | 0 | 0.3 | 21.95 | 2.5 | 0 |
1 | 1 | 2022-01-01 00:33:43 | 2022-01-01 00:42:07 | 1 | 2.1 | 1 | N | 236 | 42 | 1 | 8 | 0.5 | 0.5 | 4 | 0 | 0.3 | 13.3 | 0 | 0 |
2 | 2 | 2022-01-01 00:53:21 | 2022-01-01 01:02:19 | 1 | 0.97 | 1 | N | 166 | 166 | 1 | 7.5 | 0.5 | 0.5 | 1.76 | 0 | 0.3 | 10.56 | 0 | 0 |
3 | 2 | 2022-01-01 00:25:21 | 2022-01-01 00:35:23 | 1 | 1.09 | 1 | N | 114 | 68 | 2 | 8 | 0.5 | 0.5 | 0 | 0 | 0.3 | 11.8 | 2.5 | 0 |
4 | 2 | 2022-01-01 00:36:48 | 2022-01-01 01:14:20 | 1 | 4.3 | 1 | N | 68 | 163 | 1 | 23.5 | 0.5 | 0.5 | 3 | 0 | 0.3 | 30.3 | 2.5 | 0 |
We can easily construct a Vaex DataFrame from many other in-memory data representations, such as a pandas DataFrame, an Arrow table, NumPy arrays, a Python dict, etc.
In this example, we use pandas to read the 'top_six_economies.csv' from one of my GitHub repositories.
import pandas as pd
pd_df = pd.read_csv('https://raw.githubusercontent.com/shoukewei/data/main/data-pydm/top_six_economies.csv')
pd_df.head()
Unnamed: 0 | Country Name | Year | GDP (current US$) | GDP, PPP (current international $) | GDP per capita (current US$) | GDP growth (annual %) | Imports of goods and services (% of GDP) | Exports of goods and services (% of GDP) | Central government debt, total (% of GDP) | Total reserves (includes gold, current US$) | Unemployment, total (% of total labor force) (modeled ILO estimate) | Inflation, consumer prices (annual %) | Personal remittances, received (% of GDP) | Population, total | Population growth (annual %) | Life expectancy at birth, total (years) | Poverty headcount ratio at $1.90 a day (2011 PPP) (% of population) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 33 | United States | 1991 | 6.158129e+12 | 6.158129e+12 | 24342.258905 | -0.108265 | 10.125543 | 9.660905 | 44.061597 | 1.592729e+11 | 6.80 | 4.234964 | 0.021110 | 252981000.0 | 1.336261 | 75.365854 | 0.5 |
1 | 34 | United States | 1992 | 6.520327e+12 | 6.520327e+12 | 25418.990776 | 3.522441 | 10.241680 | 9.708915 | 46.050144 | 1.475259e+11 | 7.50 | 3.028820 | 0.027545 | 256514000.0 | 1.386886 | 75.617073 | 0.5 |
2 | 35 | United States | 1993 | 6.858559e+12 | 6.858559e+12 | 26387.293734 | 2.751781 | 10.497438 | 9.547180 | 48.246140 | 1.646202e+11 | 6.90 | 2.951657 | 0.026536 | 259919000.0 | 1.318680 | 75.419512 | 0.5 |
3 | 36 | United States | 1994 | 7.287236e+12 | 7.287236e+12 | 27694.853416 | 4.028793 | 11.162312 | 9.893147 | 47.353482 | 1.635906e+11 | 6.12 | 2.607442 | 0.026663 | 263126000.0 | 1.226296 | 75.619512 | 0.5 |
4 | 37 | United States | 1995 | 7.639749e+12 | 7.639749e+12 | 28690.875701 | 2.684217 | 11.814158 | 10.639224 | 47.209535 | 1.759954e+11 | 5.65 | 2.805420 | 0.028522 | 266278000.0 | 1.190787 | 75.621951 | 0.5 |
Then we convert the pandas DataFrame into a Vaex DataFrame using the vaex.from_pandas method.
vx_df = vaex.from_pandas(df=pd_df, copy_index=True)
vx_df.head(5)
# | Unnamed: 0 | Country Name | Year | GDP (current US$) | GDP, PPP (current international $) | GDP per capita (current US$) | GDP growth (annual %) | Imports of goods and services (% of GDP) | Exports of goods and services (% of GDP) | Central government debt, total (% of GDP) | Total reserves (includes gold, current US$) | Unemployment, total (% of total labor force) (modeled ILO estimate) | Inflation, consumer prices (annual %) | Personal remittances, received (% of GDP) | Population, total | Population growth (annual %) | Life expectancy at birth, total (years) | Poverty headcount ratio at $1.90 a day (2011 PPP) (% of population) | index |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 33 | United States | 1991 | 6.15813e+12 | 6.15813e+12 | 24342.3 | -0.108265 | 10.1255 | 9.66091 | 44.0616 | 1.59273e+11 | 6.8 | 4.23496 | 0.0211103 | 2.52981e+08 | 1.33626 | 75.3659 | 0.5 | 0 |
1 | 34 | United States | 1992 | 6.52033e+12 | 6.52033e+12 | 25419 | 3.52244 | 10.2417 | 9.70891 | 46.0501 | 1.47526e+11 | 7.5 | 3.02882 | 0.0275446 | 2.56514e+08 | 1.38689 | 75.6171 | 0.5 | 1 |
2 | 35 | United States | 1993 | 6.85856e+12 | 6.85856e+12 | 26387.3 | 2.75178 | 10.4974 | 9.54718 | 48.2461 | 1.6462e+11 | 6.9 | 2.95166 | 0.0265362 | 2.59919e+08 | 1.31868 | 75.4195 | 0.5 | 2 |
3 | 36 | United States | 1994 | 7.28724e+12 | 7.28724e+12 | 27694.9 | 4.02879 | 11.1623 | 9.89315 | 47.3535 | 1.63591e+11 | 6.12 | 2.60744 | 0.0266631 | 2.63126e+08 | 1.2263 | 75.6195 | 0.5 | 3 |
4 | 37 | United States | 1995 | 7.63975e+12 | 7.63975e+12 | 28690.9 | 2.68422 | 11.8142 | 10.6392 | 47.2095 | 1.75995e+11 | 5.65 | 2.80542 | 0.0285219 | 2.66278e+08 | 1.19079 | 75.622 | 0.5 | 4 |
When converting a Pandas DataFrame into a Vaex DataFrame, the copy_index argument determines whether the index column should also be included. This conversion is very helpful because Pandas can read data from a wide range of file formats; for example, we can use Pandas to extract data from a database and then convert it to a Vaex DataFrame.
Similarly, we can read an Arrow table as a Vaex DataFrame. In this example, let's first use pyarrow to read the yellow_tripdata_2019-01.csv file as an Arrow table.
import pyarrow.csv
arrow_table = pyarrow.csv.read_csv('./data/yellow_tripdata_2019-01.csv')
arrow_table
pyarrow.Table
VendorID: int64
tpep_pickup_datetime: timestamp[s]
tpep_dropoff_datetime: timestamp[s]
passenger_count: int64
trip_distance: double
RatecodeID: int64
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: int64
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
----
VendorID: [[1,1,2,2,2,...,1,1,1,1,1],[1,1,2,2,2,...,1,1,1,2,2],...]
tpep_pickup_datetime: [[2019-01-01 00:46:40,2019-01-01 00:59:47,2018-12-21 13:48:30,...],...]
tpep_dropoff_datetime: [[2019-01-01 00:53:20,2019-01-01 01:18:59,2018-12-21 13:52:40,...],...]
passenger_count: [[1,1,3,5,5,...,2,1,1,1,1],...]
trip_distance: [[1.5,2.6,0,0,0,...,4.6,4.1,5.5,3.7,1.2],...]
...
Next, we can easily convert the arrow table into a Vaex DataFrame.
vx_df = vaex.from_arrow_table(arrow_table)
vx_df.head(5)
# | VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2019-01-01 00:46:40 | 2019-01-01 00:53:20 | 1 | 1.5 | 1 | N | 151 | 239 | 1 | 7 | 0.5 | 0.5 | 1.65 | 0 | 0.3 | 9.95 | -- |
1 | 1 | 2019-01-01 00:59:47 | 2019-01-01 01:18:59 | 1 | 2.6 | 1 | N | 239 | 246 | 1 | 14 | 0.5 | 0.5 | 1 | 0 | 0.3 | 16.3 | -- |
2 | 2 | 2018-12-21 13:48:30 | 2018-12-21 13:52:40 | 3 | 0 | 1 | N | 236 | 236 | 1 | 4.5 | 0.5 | 0.5 | 0 | 0 | 0.3 | 5.8 | -- |
3 | 2 | 2018-11-28 15:52:25 | 2018-11-28 15:55:45 | 5 | 0 | 1 | N | 193 | 193 | 2 | 3.5 | 0.5 | 0.5 | 0 | 0 | 0.3 | 7.55 | -- |
4 | 2 | 2018-11-28 15:56:57 | 2018-11-28 15:58:33 | 5 | 0 | 2 | N | 193 | 193 | 2 | 52 | 0 | 0.5 | 0 | 0 | 0.3 | 55.55 | -- |
Besides, we can also easily convert NumPy arrays, a Python dict, or scalars into a Vaex DataFrame using vaex.from_arrays, vaex.from_dict, and vaex.from_scalars, respectively.
Vaex provides several options for exporting data, including CSV, Parquet, HDF5, FITS, Arrow, and more. Here's how to export data using Vaex:
df.export_csv('output.csv')
df.export_hdf5('output.hdf5')
df.export_arrow('output.arrow')
df.export_parquet('output.parquet')
Alternatively, we can simply use:
df.export('output.csv')
df.export('output.hdf5')
df.export('output.arrow')
df.export('output.parquet')
In the following examples, we will use the first set of methods because they clearly show which file format the data will be exported to. Below we discuss only three popular formats, namely CSV, Parquet, and HDF5.
To export a Vaex DataFrame to a CSV file, you can use the df.export_csv() method. For example, we export the df_json Vaex DataFrame above to a CSV file.
# Export a Vaex dataframe to a CSV file
df_json.export_csv('./output/population_data.csv')
The file named 'population_data.csv' is saved into the output folder in the current working directory. The Vaex DataFrame is exported in chunks, so that a large dataset that cannot fit in RAM can still be saved to disk. The chunk_size argument specifies the size of those chunks and has a default value of 1_000_000.
For larger DataFrames, it is better to export to CSV through the Apache Arrow backend, which provides better performance. For example, we can do that in the following way.
# Export a Vaex dataframe to a csv file via Arrow
df_json.export_csv_arrow('./output/population_data.csv')
To export a Vaex DataFrame to a Parquet file, you can use the df.export_parquet() method. In this example, we export the 'top_six_economies' Vaex DataFrame to a Parquet file.
# Export a Vaex dataframe to a Parquet file
vx_df.export_parquet('./output/top_six_economies.parquet')
You can specify the chunk size via the chunk_size argument; its default value is 1048576.
To export data from a Vaex DataFrame to HDF5, you can use the df.export_hdf5() method or just df.export(). Here, we export the 'yellow_tripdata_2019-01' DataFrame to an HDF5 file with the same file name. When exporting to HDF5, we can specify a particular group.
# Export data from a Vaex dataframe to a hdf5
df_by_arrow.export_hdf5('./output/yellow_tripdata_2019-01.hdf5', group='trip2019')
In this tutorial, we explored how to read and export data in various formats using Vaex. We also looked at how to easily construct a Vaex DataFrame from many other in-memory data representations, such as a pandas DataFrame, an Arrow table, NumPy arrays, a Python dict, etc. With its memory-mapped, columnar approach, Vaex offers an efficient and scalable solution for handling large datasets. By incorporating Vaex into your data pipeline, you can save time and reduce memory usage while still keeping the flexibility and ease of use of a Pandas-like API.