diff --git a/block/qcow2.c b/block/qcow2.c
index 7fb2730f09..48d22f48d5 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -788,6 +788,7 @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
     BDRVQcow2State *s = bs->opaque;
     uint64_t combined_cache_size, l2_cache_max_setting;
     bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
+    bool l2_cache_entry_size_set;
     int min_refcount_cache = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
     uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
     uint64_t max_l2_cache = virtual_disk_size / (s->cluster_size / 8);
@@ -795,6 +796,7 @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
     combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
     l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
     refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
+    l2_cache_entry_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE);
 
     combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
     l2_cache_max_setting = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE,
@@ -841,6 +843,16 @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
             }
         }
     }
+
+    /*
+     * If the L2 cache cannot cover the whole disk and no entry size was
+     * given, default to 4KB entries (capped at the cluster size). Smaller
+     * entries make loads and evictions cheaper, improving I/O performance.
+     */
+    if (*l2_cache_size < max_l2_cache && !l2_cache_entry_size_set) {
+        *l2_cache_entry_size = MIN(s->cluster_size, 4096);
+    }
+
     /* l2_cache_size and refcount_cache_size are ensured to have at least
      * their minimum values in qcow2_update_options_prepare() */
 
diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt
index c1e7751fea..d57f409861 100644
--- a/docs/qcow2-cache.txt
+++ b/docs/qcow2-cache.txt
@@ -158,10 +158,10 @@ refcount cache is as small as possible unless overridden by the user.
 
 Using smaller cache entries
 ---------------------------
-The qcow2 L2 cache stores complete tables by default. This means that
-if QEMU needs an entry from an L2 table then the whole table is read
-from disk and is kept in the cache. If the cache is full then a
-complete table needs to be evicted first.
+The qcow2 L2 cache can store complete tables. This means that if QEMU
+needs an entry from an L2 table then the whole table is read from disk
+and is kept in the cache. If the cache is full then a complete table
+needs to be evicted first.
 
 This can be inefficient with large cluster sizes since it results in
 more disk I/O and wastes more cache memory.
@@ -172,6 +172,9 @@ it smaller than the cluster size. This can be configured using the
 
    -drive file=hd.qcow2,l2-cache-size=2097152,l2-cache-entry-size=4096
 
+Since QEMU 4.0 l2-cache-entry-size defaults to 4KB (or the cluster size
+if it's smaller) when the L2 cache cannot cover the whole disk image.
+
 Some things to take into account:
 
  - The L2 cache entry size has the same restrictions as the cluster
@@ -185,7 +188,8 @@ Some things to take into account:
 
  - Try different entry sizes to see which one gives faster performance
    in your case. The block size of the host filesystem is generally a
-   good default (usually 4096 bytes in the case of ext4).
+   good default (usually 4096 bytes in the case of ext4, hence the
+   default).
 
  - Only the L2 cache can be configured this way. The refcount cache
    always uses the cluster size as the entry size.
@@ -194,7 +198,8 @@ Some things to take into account:
    (as explained in the "Choosing the right cache sizes" and "How to
    configure the cache sizes" sections in this document) then none of
    this is necessary and you can omit the "l2-cache-entry-size"
-   parameter altogether.
+   parameter altogether. In this case QEMU makes the entry size
+   equal to the cluster size by default.
 
 
 Reducing the memory usage