compare pyarrow vs pandas read_parquet

This commit is contained in:
ChangCL
2025-08-09 23:56:22 +08:00
parent 4177251e20
commit 581fdb7c7c

View File

@@ -13,17 +13,34 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "4b3733cd-3c59-4cb4-a343-4cf75ea36ceb",
"metadata": {},
"outputs": [],
"outputs": [
],
"source": [
"# If not already installed, do: pip install pandas fastparquet\n",
"# `time` must be imported explicitly: the cell previously failed with\n",
"# NameError because time.perf_counter() was used without this import.\n",
"import time\n",
"\n",
"import pandas as pd\n",
"\n",
"URL_DATA = 'https://storage.data.gov.my/transportation/cars_2025.parquet'\n",
"\n",
"# Measure how long pd.read_parquet takes to fetch and load the remote file.\n",
"# perf_counter() is the high-resolution clock intended for timing short durations.\n",
"start_time = time.perf_counter()\n",
"df = pd.read_parquet(URL_DATA)\n",
"end_time = time.perf_counter()\n",
"total_time = end_time - start_time\n",
"# Elapsed seconds, formatted to two decimal places.\n",
"print(f'{total_time:0.2f}')\n",
"\n",
"# Convert 'date' to datetime only when the dataset actually has that column.\n",
"if 'date' in df.columns: df['date'] = pd.to_datetime(df['date'])\n",
"\n",
"print(df)"