From 0d770a9fd2baf87ea5266021fde9b2d00c308ea8 Mon Sep 17 00:00:00 2001 From: Marco Frailis Date: Thu, 31 Oct 2019 09:26:44 +0100 Subject: [PATCH] Adding performance test for the chunk example --- hdf5_example.ipynb | 95 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 81 insertions(+), 14 deletions(-) diff --git a/hdf5_example.ipynb b/hdf5_example.ipynb index 09739b7..fb5b4c0 100644 --- a/hdf5_example.ipynb +++ b/hdf5_example.ipynb @@ -29,7 +29,7 @@ "\n", "f[\"dataset_one\"] = data\n", "\n", - "# We now retrieve the dataset from file\n", + "# We now retrieve the dataset from file (it is still in memory in fact)\n", "dset = f[\"dataset_one\"]\n" ] }, @@ -51,7 +51,7 @@ "source": [ "### Dataset slicing\n", "\n", - "Datasets provide analogous slicing operations as numpy arrays (with h5py). But these selections are translated by h5py to portion of the dataset and then HDF5 read the data form \"disk\". Slicing into a dataset object returns a NumpPy array.\n" + "Datasets provide analogous slicing operations as numpy arrays (with h5py). But these selections are translated by h5py to portion of the dataset and then HDF5 reads the data form \"disk\". 
Slicing into a dataset object returns a NumPy array.\n"
np.random.randint(0,2**16,(100,2048,2048), dtype=np.uint16)\n", + "\n", + "f = h5py.File('image_sequence.hdf5','w')\n", + "dset = f.create_dataset('nochunk', data=rdata)\n", + "f.flush()\n", + "f.close()\n", + "\n", + "f = h5py.File('image_sequence_chunked.hdf5','w')\n", + "dset = f.create_dataset('chunked', data=rdata, chunks=(100,64,64))\n", + "f.flush()\n", + "f.close()\n", + "\n", + "f = h5py.File('image_sequence.hdf5','r')\n", + "dset = f['nochunk']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "for i in range(32):\n", + " for j in range(32):\n", + " block = dset[:,64*i:64*(i+1), 64*j:64*(j+1)]\n", + " np.median(block, 0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f.close()\n", + "f = h5py.File('image_sequence_chunked.hdf5','r')\n", + "dset = f['chunked']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "for i in range(32):\n", + " for j in range(32):\n", + " block = dset[:,64*i:64*(i+1), 64*j:64*(j+1)]\n", + " np.median(block, 0)" ] }, { @@ -269,11 +327,20 @@ "metadata": {}, "outputs": [], "source": [ + "f.close()\n", + "f = h5py.File('image_sequence_chunked.hdf5','a')\n", "dset = f.require_dataset('auto_chunked', (2048,2048), dtype=np.float32, compression=\"gzip\")\n", "print(dset.compression)\n", "print(dset.compression_opts)\n", "print(dset.chunks)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -292,7 +359,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.7.4" } }, "nbformat": 4, -- GitLab