Files
h4b-image-optim/includes/class-picture-tag.php
Henk 4cd1390a94 feat: migrate-from-smush + Picture-tag rewriter (v0.2.0)
Unblocks production use on sites previously running Smush.

migrate-from-smush:
  - Reads wp-smpro-smush-data postmeta, writes _h4b_img_optim marker
  - --dry-run / --force-rescan / --remove-smush-meta / --limit flags
  - Verified: 100 attachments migrated cleanly on dev.rds.ink,
    bulk count drops from 734 → 634

Picture_Tag rewriter:
  - Hooks the_content + post_thumbnail_html + widget_text + Elementor
    frontend + wp_get_attachment_image at priority 99
  - Wraps <img> in <picture><source avif><source webp><img></picture>
    when sibling files exist
  - Double-wrap protection via byte-range tracking of existing <picture> blocks
  - Per-image opt-out via data-no-h4b attribute
  - Cached sibling lookups per request
  - 8 edge-case tests pass

LOC: 2480 (was 1997). Adds class-cli-migrate.php (193 LOC) and
class-picture-tag.php (284 LOC).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-19 13:50:41 +10:00

285 lines
8.9 KiB
PHP

<?php
/**
* Rewrites <img src="…jpg|png"> tags to <picture> with WebP + AVIF sources.
*
* Design notes
* ============
* 1. We hook `the_content` (post body), `widget_text` (text widgets),
* `post_thumbnail_html` (featured image) and `wp_get_attachment_image` (most
* theme/plugin image calls). For Elementor we register on
* `elementor/frontend/the_content` as well — Elementor pipes the_content
* through its own filter chain in some templates.
*
* 2. Sibling files MUST exist on disk for us to emit a <source>. If only WebP
* exists, we emit just the WebP source; same for AVIF. We never emit a
* <source> pointing at a non-existent file (would 404 in browsers).
*
* 3. Sibling naming convention is `<original>.webp` / `<original>.avif`, stored
* next to the original — that is what Format_Generator produces. As a WebP
* fallback we also check Smush's mirror tree,
* `wp-content/smush-webp/<path relative to uploads>.webp`.
*
* 4. We DO NOT rewrite:
* - <img> already inside a <picture> (avoid double-wrapping)
* - <img> with data-no-h4b attribute
* - <img> with no usable src
* - srcset URLs on the <img> itself (they stay exactly as written; each
* <source> we add carries a single URL derived from the img's src)
*
* 5. Keep ALL original <img> attributes intact (class, alt, srcset, sizes,
* width, height, loading, decoding, fetchpriority, …). The <img> remains
* the visible fallback for browsers that don't understand <picture>.
*
* 6. Performance: build the regex once, iterate once over each filter call.
* Cache the "do the siblings exist" check per request (static array).
*
* @package H4B\ImageOptim
*/
namespace H4B\ImageOptim;
// Stop immediately when ABSPATH is undefined, i.e. the file was requested
// directly instead of being loaded through WordPress.
defined( 'ABSPATH' ) || exit;
final class Picture_Tag {

    /**
     * Per-request memo of sibling lookups, keyed by the image URL exactly as
     * it appeared in the markup (query string included).
     *
     * @var array<string,array{webp:?string,avif:?string}>
     */
    private static array $sibling_cache = [];

    /**
     * Attach the rewriter to every supported HTML filter.
     *
     * Does nothing when the `rewrite_content_images` setting is falsy.
     */
    public static function register(): void {
        if ( ! Settings::get( 'rewrite_content_images', true ) ) {
            return;
        }

        // Run LATE so other filters get to manipulate the raw <img> first.
        add_filter( 'the_content', [ self::class, 'rewrite_html' ], 99 );
        add_filter( 'post_thumbnail_html', [ self::class, 'rewrite_html' ], 99 );
        add_filter( 'widget_text', [ self::class, 'rewrite_html' ], 99 );

        // Elementor's frontend content filter.
        add_filter( 'elementor/frontend/the_content', [ self::class, 'rewrite_html' ], 99 );

        // Single attachment image (commonly used by themes + WooCommerce).
        add_filter( 'wp_get_attachment_image', [ self::class, 'rewrite_html' ], 99, 5 );
    }

    /**
     * Rewrite all <img> tags in $html to <picture> wrappers where siblings exist.
     *
     * <img> tags that already sit inside a <picture>…</picture> block of the
     * input are copied through untouched, so running the filter twice over the
     * same markup never double-wraps.
     *
     * @param mixed $html      Markup handed over by the filter. Non-strings yield ''.
     * @param mixed ...$_extra Additional filter arguments; accepted and ignored.
     * @return string The (possibly rewritten) markup.
     */
    public static function rewrite_html( $html, ...$_extra ): string {
        // Coerce to string; some filters can pass non-string in edge cases.
        if ( ! is_string( $html ) ) {
            return '';
        }

        // Quick reject: empty input or no <img tag at all.
        if ( '' === $html || false === stripos( $html, '<img' ) ) {
            return $html;
        }

        // preg_match_all() returns 0 for "no match" and false on a PCRE error;
        // either way there is nothing we can safely rewrite.
        if ( ! preg_match_all( '#<img\b[^>]*>#i', $html, $matches, PREG_OFFSET_CAPTURE ) ) {
            return $html;
        }

        // Byte ranges of existing <picture> blocks. Offsets from
        // PREG_OFFSET_CAPTURE and stripos() are both byte offsets, so they
        // can be compared directly.
        $picture_ranges = self::find_picture_ranges( $html );

        // Single pass: copy non-img bytes through, rewrite qualifying <img> tags.
        $out    = '';
        $cursor = 0;
        foreach ( $matches[0] as [ $img_tag, $offset ] ) {
            $out .= substr( $html, $cursor, $offset - $cursor );
            $out .= self::is_inside_range( $offset, $picture_ranges )
                ? $img_tag
                : self::maybe_wrap( $img_tag );
            $cursor = $offset + strlen( $img_tag );
        }

        return $out . substr( $html, $cursor );
    }

    /**
     * Build a list of [start, end_exclusive] byte ranges covering every
     * <picture>…</picture> block in $html.
     *
     * Only a real <picture> opening tag starts a range: the name must be
     * followed by whitespace, `/` or `>`, so `<picture-frame>` and similar
     * are ignored. Scanning stops at the first <picture> without a closing tag.
     *
     * @return array<int, array{0:int,1:int}>
     */
    private static function find_picture_ranges( string $html ): array {
        $close_tag = '</picture>';
        $ranges    = [];
        $offset    = 0;

        while ( preg_match( '#<picture(?=[\s/>])#i', $html, $m, PREG_OFFSET_CAPTURE, $offset ) ) {
            $open  = $m[0][1];
            $close = stripos( $html, $close_tag, $open );
            if ( false === $close ) {
                break;
            }

            $end      = $close + strlen( $close_tag );
            $ranges[] = [ $open, $end ];
            $offset   = $end;
        }

        return $ranges;
    }

    /**
     * Tell whether a byte offset falls inside any of the given ranges.
     *
     * @param array<int, array{0:int,1:int}> $ranges [start, end_exclusive] pairs.
     */
    private static function is_inside_range( int $offset, array $ranges ): bool {
        foreach ( $ranges as [ $start, $end ] ) {
            if ( $offset >= $start && $offset < $end ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Decide whether to wrap a single <img> tag.
     *
     * @param string $img_tag One complete <img …> tag.
     * @return string The tag wrapped in <picture> with AVIF/WebP sources, or
     *                the tag unchanged when it is opted out, has no quoted src,
     *                or has no sibling files.
     */
    private static function maybe_wrap( string $img_tag ): string {
        // Explicit per-image opt-out. HTML attribute names are
        // case-insensitive, so match the marker that way too.
        if ( false !== stripos( $img_tag, 'data-no-h4b' ) ) {
            return $img_tag;
        }

        // Extract the quoted src value. The look-behind keeps `data-src`,
        // `data-lazy-src` and friends from being mistaken for the real `src`
        // attribute (a plain \b would match right after the hyphen).
        if ( ! preg_match( '#(?<![\w:-])src\s*=\s*([\'"])(.+?)\1#i', $img_tag, $sm ) ) {
            return $img_tag;
        }

        $siblings = self::resolve_siblings( $sm[2] );
        if ( ! $siblings['webp'] && ! $siblings['avif'] ) {
            return $img_tag;
        }

        // Each <source> carries a single URL; srcset/sizes-aware sources are
        // not implemented. AVIF goes first so browsers that support both
        // formats pick it.
        $sources = '';
        if ( $siblings['avif'] ) {
            $sources .= sprintf(
                '<source type="image/avif" srcset="%s">',
                esc_attr( $siblings['avif'] )
            );
        }
        if ( $siblings['webp'] ) {
            $sources .= sprintf(
                '<source type="image/webp" srcset="%s">',
                esc_attr( $siblings['webp'] )
            );
        }

        return '<picture>' . $sources . $img_tag . '</picture>';
    }

    /**
     * Given an image URL, return WebP + AVIF sibling URLs if they exist on disk.
     *
     * Results (including misses) are memoised in self::$sibling_cache.
     *
     * @return array{webp:?string, avif:?string}
     */
    private static function resolve_siblings( string $url ): array {
        if ( isset( self::$sibling_cache[ $url ] ) ) {
            return self::$sibling_cache[ $url ];
        }

        $result = [ 'webp' => null, 'avif' => null ];

        // Only act on JPG / JPEG / PNG (optionally followed by a query string).
        if ( ! preg_match( '#\.(jpe?g|png)(\?.*)?$#i', $url ) ) {
            return self::$sibling_cache[ $url ] = $result;
        }

        // Strip the query string for the filesystem lookup. explode() is used
        // instead of strtok() because strtok() keeps hidden global tokenizer
        // state that would clobber any tokenisation a caller has in progress.
        $url_clean = explode( '?', $url, 2 )[0];

        // Convert URL → absolute path on disk. Paths with `..` segments are
        // refused so markup cannot probe files outside the mapped directory.
        $path = self::url_to_path( $url_clean );
        if ( null === $path || preg_match( '#(?:^|[\\\\/])\.\.(?:[\\\\/]|$)#', $path ) ) {
            return self::$sibling_cache[ $url ] = $result;
        }

        // Candidate 1: alongside the source (what we generate).
        if ( is_readable( $path . '.webp' ) ) {
            $result['webp'] = $url_clean . '.webp';
        } else {
            // Candidate 2: Smush's smush-webp/ tree, which mirrors uploads/.
            $content_dir = trailingslashit( WP_CONTENT_DIR );
            $uploads_dir = $content_dir . 'uploads/';
            if ( 0 === strpos( $path, $uploads_dir ) ) {
                $rel = substr( $path, strlen( $uploads_dir ) ) . '.webp';
                if ( is_readable( $content_dir . 'smush-webp/' . $rel ) ) {
                    $result['webp'] = trailingslashit( WP_CONTENT_URL ) . 'smush-webp/' . $rel;
                }
            }
        }

        if ( is_readable( $path . '.avif' ) ) {
            $result['avif'] = $url_clean . '.avif';
        }

        return self::$sibling_cache[ $url ] = $result;
    }

    /**
     * Convert a URL to its absolute filesystem path, or null if the URL is
     * external / can't be resolved to a local file.
     *
     * Absolute and protocol-relative URLs are compared against the content and
     * uploads base URLs with the scheme ignored, so an `https://` image on a
     * site configured with an `http://` base (or vice versa) still resolves.
     */
    private static function url_to_path( string $url ): ?string {
        if ( '' === $url ) {
            return null;
        }

        $content_dir = trailingslashit( WP_CONTENT_DIR );
        $content_url = trailingslashit( WP_CONTENT_URL );
        $uploads     = wp_get_upload_dir();

        // wp-content/ first (covers themes + plugins + uploads), then the
        // uploads base on its own in case it lives elsewhere. Both sides are
        // slash-terminated so `/uploads-foo/` can never match `/uploads`.
        $bases = [
            [ $content_url, $content_dir ],
            [ trailingslashit( $uploads['baseurl'] ), trailingslashit( $uploads['basedir'] ) ],
        ];

        $bare_url = self::strip_scheme( $url );
        foreach ( $bases as [ $base_url, $base_dir ] ) {
            $bare_base = self::strip_scheme( $base_url );
            if ( null !== $bare_url && null !== $bare_base ) {
                // Both carry a host: compare with the scheme removed.
                $haystack = $bare_url;
                $needle   = $bare_base;
            } else {
                // At least one side is relative: compare verbatim.
                $haystack = $url;
                $needle   = $base_url;
            }
            if ( 0 === strpos( $haystack, $needle ) ) {
                return $base_dir . substr( $haystack, strlen( $needle ) );
            }
        }

        // Site-relative, e.g. /wp-content/uploads/2026/02/foo.jpg. Try the
        // real URL path of the content directory first (handles installs in
        // a subdirectory), then the bare directory name.
        if ( null === $bare_url && '/' === $url[0] ) {
            $prefixes = [];
            $url_path = parse_url( $content_url, PHP_URL_PATH );
            if ( is_string( $url_path ) && '' !== $url_path ) {
                $prefixes[] = trailingslashit( $url_path );
            }
            $prefixes[] = '/' . wp_basename( $content_dir ) . '/';

            foreach ( array_unique( $prefixes ) as $prefix ) {
                if ( 0 === strpos( $url, $prefix ) ) {
                    return $content_dir . substr( $url, strlen( $prefix ) );
                }
            }
        }

        return null;
    }

    /**
     * Drop a leading `http://`, `https://` or `//` from a URL.
     *
     * @return string|null The remainder (host + path), or null when the URL
     *                     has none of those prefixes (e.g. it is site-relative).
     */
    private static function strip_scheme( string $url ): ?string {
        if ( 0 === strpos( $url, '//' ) ) {
            return substr( $url, 2 );
        }
        if ( preg_match( '#^https?://#i', $url, $m ) ) {
            return substr( $url, strlen( $m[0] ) );
        }

        return null;
    }
}