compare pyarrow vs pandas read_parquet

This commit is contained in:
ChangCL
2025-08-09 23:58:16 +08:00
parent 581fdb7c7c
commit 8082155709

View File

@@ -13,25 +13,49 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "4b3733cd-3c59-4cb4-a343-4cf75ea36ceb",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'time' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpd\u001b[39;00m\n\u001b[32m 4\u001b[39m URL_DATA = \u001b[33m'\u001b[39m\u001b[33mhttps://storage.data.gov.my/transportation/cars_2025.parquet\u001b[39m\u001b[33m'\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m start_time = \u001b[43mtime\u001b[49m.perf_counter()\n\u001b[32m 7\u001b[39m df = pd.read_parquet(URL_DATA)\n\u001b[32m 8\u001b[39m end_time = time.perf_counter()\n",
"\u001b[31mNameError\u001b[39m: name 'time' is not defined"
"name": "stdout",
"output_type": "stream",
"text": [
"0.53\n",
" date_reg type maker model colour fuel \\\n",
"0 2025-01-01 motokar BYD Seal white electric \n",
"1 2025-01-01 window_van Cam Placer-X yellow greendiesel \n",
"2 2025-01-01 jip Chery Jaecoo J7 green petrol \n",
"3 2025-01-01 jip Chery Jaecoo J7 silver petrol \n",
"4 2025-01-01 jip Chery Tiggo grey petrol \n",
"... ... ... ... ... ... ... \n",
"396824 2025-06-30 window_van Zeekr 009 white electric \n",
"396825 2025-06-30 jip Zeekr X beige electric \n",
"396826 2025-06-30 jip Zeekr X grey electric \n",
"396827 2025-06-30 jip Zeekr X grey electric \n",
"396828 2025-06-30 jip Zeekr X green electric \n",
"\n",
" state \n",
"0 Rakan Niaga \n",
"1 Johor \n",
"2 Rakan Niaga \n",
"3 Rakan Niaga \n",
"4 Rakan Niaga \n",
"... ... \n",
"396824 W.P. Kuala Lumpur \n",
"396825 W.P. Kuala Lumpur \n",
"396826 W.P. Kuala Lumpur \n",
"396827 W.P. Kuala Lumpur \n",
"396828 W.P. Kuala Lumpur \n",
"\n",
"[396829 rows x 7 columns]\n"
]
}
],
"source": [
"# If not already installed, do: pip install pandas fastparquet\n",
"import pandas as pd\n",
"import time\n",
"\n",
"URL_DATA = 'https://storage.data.gov.my/transportation/cars_2025.parquet'\n",
"\n",
@@ -48,7 +72,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"id": "72c39de4-4b98-4dca-8152-3a51f84d86f5",
"metadata": {},
"outputs": [
@@ -56,6 +80,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0.18\n",
" date_reg type maker model colour fuel \\\n",
"0 2025-01-01 motokar BYD Seal white electric \n",
"1 2025-01-01 window_van Cam Placer-X yellow greendiesel \n",
@@ -88,11 +113,17 @@
],
"source": [
"import pandas as pd\n",
"import time\n",
"\n",
"URL_DATA = 'https://storage.data.gov.my/transportation/cars_2025.parquet'\n",
"\n",
"# Assuming 'your_file.parquet' is the path to your Parquet file\n",
"start_time = time.perf_counter()\n",
"df = pd.read_parquet(URL_DATA, engine='pyarrow')\n",
"end_time = time.perf_counter()\n",
"total_time = end_time - start_time\n",
"print(f'{total_time:0.2f}')\n",
"\n",
"if 'date' in df.columns: df['date'] = pd.to_datetime(df['date'])\n",
"\n",
"# print(df.head())\n",