#!/usr/bin/env python3
#
# Copyright (C) 2020-2022 Dirk Bergstrom <dirk@otisbean.com>. All Rights Reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
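"""Build a static photo gallery manifest.

Scan DIR/originals for JPEG files, create web-sized display copies and
thumbnails in DIR/resized, pull titles and EXIF metadata out of each
image, and write the results to DIR/content.json for consumption by
Nanogallery.

Typical invocation (script and directory names are illustrative):

    python3 make-gallery.py --sorting chron ~/photos/my-album
"""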
import json
import re
import datetime
from io import BytesIO
from pathlib import Path
import argparse
import sys
import logging
import binascii
from fractions import Fraction
from PIL import Image, ImageFilter, ExifTags, TiffImagePlugin
from titlecase import titlecase

CONTENT_FILE = "content.json"
ORDERING_FILE = "image-order.txt"
MAX_WIDTH = 1600
MAX_HEIGHT = 1400
THUMBNAIL_HEIGHT = 225

logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s %(levelname)s: %(message)s")


def isoformat(timeint):
    return datetime.datetime.fromtimestamp(timeint) \
        .replace(microsecond=0).isoformat().replace("T", " ")


def exif_to_isodate(exif_date):
    dt = datetime.datetime.strptime(exif_date, "%Y:%m:%d %H:%M:%S")
    return dt.replace(microsecond=0).isoformat().replace("T", " ")


def fixexif(val):
    """Remove unprintable characters (other than whitespace) and strip
    leading & trailing whitespace.

    The text fields in EXIF data are full of garbage.
    """
    return "".join([x for x in val if (x.isprintable() or x.isspace())]).strip()


def read_exif_metadata(img, data):
"""Read EXIF from the photo and map it to Nanogallery data.
Model, Make & LensModel => exifModel
Flash => exifFlash (as "" or "Flash")
FocalLength => exifFocalLength (as an integer)
FNumber => exifFStop (as '.1f')
ExposureTime => exifExposure (as either int seconds or a fraction)
    ISOSpeedRatings => exifIso
DateTimeOriginal => exifTime
UserComment => description ("Caption" field in DigiKam)
DocumentName => title ("Name" field in DigiKam)
"""
# EXIF tag data is a disgusting swamp of badly formatted information
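    # (_getexif() is Pillow's legacy flat-dict accessor; current Pillow also
    # offers Image.getexif(), but the flat dict is convenient here.)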
raw_exif = img._getexif()
if not raw_exif:
logging.info("No exif in photo")
return
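    # Translate numeric EXIF tag IDs into human-readable tag names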
exif_tags = {ExifTags.TAGS.get(k, k): v for k, v in raw_exif.items()}
mod = exif_tags.get('Model')
if mod:
mod = fixexif(mod)
make = exif_tags.get('Make')
if make:
make = titlecase(fixexif(make))
mod = "{} {}".format(make, mod)
lens = exif_tags.get('LensModel')
if lens:
mod = "{}; {}".format(mod, fixexif(lens))
data['exifModel'] = mod
# exif flash is a bitmask where the first bit is "did it fire"
flash = exif_tags.get('Flash', 0)
data['exifFlash'] = "Flash" if (flash & 1) else ""
fl = exif_tags.get('FocalLength')
if fl:
if isinstance(fl, tuple):
# A tuple. One hopes that the first element is always the focal
# length, and the second 1, but...
data['exifFocalLength'] = int(fl[0] / fl[1])
else:
# Hope it's a number...
data['exifFocalLength'] = int(fl)
fn = exif_tags.get('FNumber')
if fn:
if isinstance(fn, tuple):
# Another tuple
data['exifFStop'] = "{:.1f}".format(fn[0] / fn[1])
else:
data['exifFStop'] = "{:.1f}".format(float(fn))
et = exif_tags.get('ExposureTime')
if et:
if isinstance(et, TiffImagePlugin.IFDRational):
            if float(et) > 1:
et = float(et)
else:
et = (et.numerator, et.denominator)
if isinstance(et, tuple):
if et[0] == et[1]:
et = "1"
elif et[1] == 1:
# Integer number of seconds
et = f"{et[0]}"
else:
# Format as a fraction.
# FIXME Should do a better job with times > 1s
et = f"{et[0]}/{et[1]}"
        elif isinstance(et, float):
            # Try to turn floats into something that looks like it came out
            # of a camera UI:
            #   whole seconds: 4
            #   1s - 2s: 1.33
            #   > 2s: 2.4
            #   < 1s: a fraction, e.g. 1/250
            if et == int(et):
                et = f'{et:.0f}'
            elif 1 < et < 2:
                et = f'{et:.2f}'
            elif et >= 2:
                et = f'{et:.1f}'
            else:
                et = str(Fraction(et).limit_denominator())
else:
try:
et = str(et)
except Exception:
et = "???"
data['exifExposure'] = et
iso = exif_tags.get('ISOSpeedRatings')
if iso:
data['exifIso'] = iso
dto = exif_tags.get('DateTimeOriginal')
if dto:
data['exifTime'] = exif_to_isodate(dto)
# UserComment => description
# Along the way we convert newlines to <br> tags and linkify URLs
uc = exif_tags.get('UserComment')
    if uc:
        if isinstance(uc, bytes):
            # Usually bytes, occasionally already str; don't crash on
            # non-UTF-8 bytes either.
            uc = uc.decode(errors="replace")
        uc = fixexif(uc)
if uc.startswith("ASCII"):
# As written by DigiKam the UserComment field has
# a prefix of 'ASCII\x00\x00\x00'
uc = uc[5:]
if uc.startswith("UNICODE"):
# Or sometimes 'UNICODE\x00\x00\x00'
uc = uc[7:]
uc = re.sub("\n", "\n<br>", uc)
uc = re.sub(r"(https?://\S+)", r'<a href="\1">\1</a>', uc)
data['description'] = uc
dn = exif_tags.get('DocumentName')
if (dn and not
re.search(r'\.jpg', dn, re.IGNORECASE) and not
re.search(r'\d{5}', dn)):
# Looks like an actual title, not just a filename
data['title'] = dn


def doit(directory, force, force_resize, sorting, include_originals, dry_run):
    """Process one gallery directory.

    Find new or changed JPEGs in directory/originals, create web-sized
    copies and thumbnails, extract EXIF metadata, and write it all to
    directory/content.json.
    """
orig_dir = directory / "originals"
if not orig_dir.exists():
print("Expected to find a sub-directory named 'originals' "
"containing image files.", file=sys.stderr)
sys.exit(1)
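    # An existing exclude file means originals were excluded on a previous
    # run; treat that as a persistent opt-out.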
rsync_exclude_file = directory / "exclude-originals.txt"
if rsync_exclude_file.exists():
include_originals = False
content_file = directory / CONTENT_FILE
if not content_file.exists():
content = []
else:
with content_file.open() as cf:
content = json.load(cf)
old = {c['filename']: c for c in content}
new = []
done = []
for path in sorted(orig_dir.glob('*.jpg')):
if path.is_file():
# A candidate
mtime = isoformat(path.stat().st_mtime)
oe = old.get(path.name)
if (force or oe is None or oe.get('mtime') != mtime):
# File is new or changed
newfilespec = dict(
filename=path.name,
mtime=mtime,
path=path,
ID=re.sub(r'[^\w-]', '-', path.stem),
)
if include_originals:
newfilespec["downloadURL"] = f"{orig_dir.name}/{path.name}"
new.append(newfilespec)
else:
# We have up-to-date info for this file
done.append(oe)
if len(new) == 0:
logging.info("No changes, exiting.")
return
resized_dir = directory / "resized"
if not resized_dir.exists():
resized_dir.mkdir()
# Process new files
for data in new:
path = data.pop("path")
logging.info("Processing %s", path.name)
img = Image.open(path)
# Save the EXIF data so we can write it back out
exif_bytes = img.info.get('exif', b'')
if img.width > MAX_WIDTH or img.height > MAX_HEIGHT:
# Image too large, need maxpect image for web display
logging.info("Image too large (%d x %d)", img.width, img.height)
resized_name = f"web-{path.name}"
resized_path = resized_dir / resized_name
if resized_path.exists() and not force_resize:
logging.info("Reading size of existing maxpect")
maxpect = Image.open(resized_path)
else:
logging.info("Making maxpect")
maxpect = img.copy()
# thumbnail() method modifies image, preserves aspect ratio.
# Image.LANCZOS is the best quality and seems plenty fast
# Image.BICUBIC is faster but lower quality.
maxpect.thumbnail(
(MAX_WIDTH, MAX_HEIGHT), resample=Image.LANCZOS)
logging.debug('Saving maxpect as "%s"', resized_path)
if not dry_run:
                    maxpect.save(resized_path,
                                 quality=90,
                                 progressive=True,
                                 optimize=True,
                                 exif=exif_bytes,
                                 icc_profile=img.info.get('icc_profile'))
data["imgWidth"] = maxpect.width
data["imgHeight"] = maxpect.height
data["src"] = f'{resized_dir.name}/{resized_name}'
        else:
            # Small enough to serve as-is: point straight at the original.
            # (downloadURL is only set when originals are included, so build
            # the path directly to avoid a KeyError.)
            data["src"] = f"{orig_dir.name}/{path.name}"
            data["imgWidth"] = img.width
            data["imgHeight"] = img.height
read_exif_metadata(img, data)
if "title" not in data:
# Nothing in EXIF, use the filename
if not re.search(r'\d{5}', path.name):
# Doesn't look like a serial number, assume it's text and try
# to make it pretty.
data['title'] = titlecase(re.sub(r'[_-]', ' ', path.stem))
else:
data['title'] = path.name
# make thumbnail (cropping to 90%)
thumb_path = resized_dir / f"thumb-{path.name}"
logging.info("Making thumbnail %s", thumb_path)
crop_coords = (
img.width / 20,
img.height / 20,
img.width - img.width / 20,
img.height - img.height / 20
)
thumb = img.crop(crop_coords)
        # Scale so the height lands at THUMBNAIL_HEIGHT, preserving aspect
        hratio = thumb.height / THUMBNAIL_HEIGHT
        thumb.thumbnail((round(thumb.width / hratio), THUMBNAIL_HEIGHT))
if not dry_run:
thumb.save(thumb_path)
data["srct"] = f"{resized_dir.name}/{thumb_path.name}"
data["imgtWidth"] = thumb.width
data["imgtHeight"] = thumb.height
# Get dominant colors
        # Resize to ~15x15, blur, create gif, base64 encode
# (Fancier method: https://github.com/fengsp/color-thief-py)
logging.info("Creating 'dominant colors' gif")
thumb.thumbnail((15, 15))
blurred = thumb.filter(filter=ImageFilter.BLUR)
bio = BytesIO()
blurred.save(bio, format="GIF")
        # newline=False keeps b2a_base64's trailing newline out of the data URI
        gif_encoded = binascii.b2a_base64(
            bio.getvalue(), newline=False).decode('utf8')
# Add to new dict
data['imageDominantColors'] = f"data:image/gif;base64,{gif_encoded}"
        if not include_originals and "downloadURL" in data:
            del data["downloadURL"]
done.append(data)
# FIXME Remove orphaned thumbs and originals
ordering_file = directory / ORDERING_FILE
if ordering_file.exists():
# Put images in the order given in the file
with ordering_file.open() as of:
image_order = [fname.strip() for fname in of.readlines()]
def getindex(entry):
try:
return image_order.index(entry['filename'])
except ValueError:
# Unknown files go at the end
return 9999
done.sort(key=getindex)
else:
if sorting == 'alnum':
# Sort images by title
done.sort(key=lambda x: x.get('title', x['filename']))
elif sorting in ('revchron', 'chron'):
# Sort images by exif time & mtime
done.sort(key=lambda x: x.get('exifTime', x['mtime']),
reverse=(sorting == 'revchron'))
else:
raise Exception(f"Unsupported sort order {sorting}")
# Write new CONTENT_FILE
if dry_run:
print(json.dumps(done, indent=1), file=sys.stderr)
else:
# Handle semaphore file for display/exclude originals
if include_originals and rsync_exclude_file.exists():
rsync_exclude_file.unlink()
elif not include_originals:
with open(rsync_exclude_file, "w") as ref:
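                # "H" is rsync's "hide" filter rule: it keeps originals/**
                # out of the transfer when this is used as a filter merge file.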
print("H originals/**\n", file=ref)
# Make symlink to latest thumbnail image
latest = Path(done[0]['srct']).name
symlink_path = resized_dir / 'latest.jpg'
try:
if symlink_path.exists() or symlink_path.is_symlink():
logging.debug("unlinking old symlink %s", symlink_path)
symlink_path.unlink()
logging.info("Creating 'latest.jpg' symlink %s -> %s", symlink_path, latest)
            symlink_path.symlink_to(latest)
        except OSError as e:
            logging.error("Failed to create 'latest.jpg' symlink: %s", e)
# Write JSON
logging.info("Writing %s", directory / CONTENT_FILE)
with (directory / CONTENT_FILE).open(mode='w') as fp:
json.dump(done, fp, indent=1)


if __name__ == '__main__':
parser = argparse.ArgumentParser(
epilog=f"If the file `{ORDERING_FILE}` is present in the directory, "
"images will be presented in the order listed there.")
parser.add_argument("directory", metavar="DIR", type=str,
help="Directory holding images and content.json")
parser.add_argument("--sorting", choices=["revchron", "chron", "alnum"],
default="revchron", help="Sort order")
parser.add_argument("--exclude-originals", action="store_true",
help="Don't publish original images or download links.")
parser.add_argument("--dry-run", action="store_true",
help="Don't modify any files")
parser.add_argument("--force", action="store_true",
help="Reprocess all files")
parser.add_argument("--force-resize", action="store_true",
help="Reprocess all files and recreate maxpect images")
args = parser.parse_args()
doit(Path(args.directory),
(args.force or args.force_resize),
args.force_resize,
args.sorting,
not args.exclude_originals,
args.dry_run)