Browse Source

initial commit

main
Sam Blazes 2 years ago
commit
6e82372485
  1. 1
      .gitignore
  2. 21
      Cargo.lock
  3. 7
      Cargo.toml
  4. 14
      README.md
  5. 132
      artspace.rs
  6. 132
      src/main.rs
  7. BIN
      test_image.jpg
  8. 22
      third_party/zune-core/CHANGELOG.md
  9. 22
      third_party/zune-core/Cargo.toml
  10. 1
      third_party/zune-core/LICENSE-APACHE
  11. 1
      third_party/zune-core/LICENSE-MIT
  12. 1
      third_party/zune-core/LICENSE-ZLIB
  13. 15
      third_party/zune-core/README.md
  14. 170
      third_party/zune-core/src/bit_depth.rs
  15. 27
      third_party/zune-core/src/bytestream.rs
  16. 458
      third_party/zune-core/src/bytestream/reader.rs
  17. 198
      third_party/zune-core/src/bytestream/reader/no_std_readers.rs
  18. 100
      third_party/zune-core/src/bytestream/reader/std_readers.rs
  19. 146
      third_party/zune-core/src/bytestream/traits.rs
  20. 262
      third_party/zune-core/src/bytestream/writer.rs
  21. 70
      third_party/zune-core/src/bytestream/writer/no_std_writer.rs
  22. 27
      third_party/zune-core/src/bytestream/writer/std_writer.rs
  23. 161
      third_party/zune-core/src/colorspace.rs
  24. 62
      third_party/zune-core/src/lib.rs
  25. 74
      third_party/zune-core/src/log.rs
  26. 13
      third_party/zune-core/src/options.rs
  27. 666
      third_party/zune-core/src/options/decoder.rs
  28. 217
      third_party/zune-core/src/options/encoder.rs
  29. 72
      third_party/zune-core/src/result.rs
  30. 63
      third_party/zune-core/src/serde.rs
  31. 1
      third_party/zune-jpeg/.gitignore
  32. 79
      third_party/zune-jpeg/Benches.md
  33. 26
      third_party/zune-jpeg/Cargo.toml
  34. 64
      third_party/zune-jpeg/Changelog.md
  35. 1
      third_party/zune-jpeg/LICENSE-APACHE
  36. 1
      third_party/zune-jpeg/LICENSE-MIT
  37. 1
      third_party/zune-jpeg/LICENSE-ZLIB
  38. 104
      third_party/zune-jpeg/README.md
  39. 3
      third_party/zune-jpeg/fuzz/.gitignore
  40. 32
      third_party/zune-jpeg/fuzz/Cargo.toml
  41. 10
      third_party/zune-jpeg/fuzz/fuzz_targets/decode_buffer.rs
  42. 47
      third_party/zune-jpeg/fuzz/fuzz_targets/fuzz_idct.rs
  43. 671
      third_party/zune-jpeg/src/bitstream.rs
  44. 89
      third_party/zune-jpeg/src/color_convert.rs
  45. 350
      third_party/zune-jpeg/src/color_convert/avx.rs
  46. 116
      third_party/zune-jpeg/src/color_convert/scalar.rs
  47. 211
      third_party/zune-jpeg/src/components.rs
  48. 910
      third_party/zune-jpeg/src/decoder.rs
  49. 167
      third_party/zune-jpeg/src/errors.rs
  50. 544
      third_party/zune-jpeg/src/headers.rs
  51. 254
      third_party/zune-jpeg/src/huffman.rs
  52. 147
      third_party/zune-jpeg/src/idct.rs
  53. 288
      third_party/zune-jpeg/src/idct/avx2.rs
  54. 296
      third_party/zune-jpeg/src/idct/neon.rs
  55. 212
      third_party/zune-jpeg/src/idct/scalar.rs
  56. 133
      third_party/zune-jpeg/src/lib.rs
  57. 85
      third_party/zune-jpeg/src/marker.rs
  58. 504
      third_party/zune-jpeg/src/mcu.rs
  59. 617
      third_party/zune-jpeg/src/mcu_prog.rs
  60. 431
      third_party/zune-jpeg/src/misc.rs
  61. 4
      third_party/zune-jpeg/src/unsafe_utils.rs
  62. 223
      third_party/zune-jpeg/src/unsafe_utils_avx2.rs
  63. 331
      third_party/zune-jpeg/src/unsafe_utils_neon.rs
  64. 101
      third_party/zune-jpeg/src/upsampler.rs
  65. 110
      third_party/zune-jpeg/src/upsampler/scalar.rs
  66. 429
      third_party/zune-jpeg/src/worker.rs
  67. 66
      third_party/zune-jpeg/tests/invalid_images.rs

1
.gitignore vendored

@ -0,0 +1 @@
/target

21
Cargo.lock generated

@ -0,0 +1,21 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "artspace"
version = "0.1.0"
dependencies = [
"zune-jpeg",
]
[[package]]
name = "zune-core"
version = "0.5.0-rc1"
[[package]]
name = "zune-jpeg"
version = "0.5.0-rc1"
dependencies = [
"zune-core",
]

7
Cargo.toml

@ -0,0 +1,7 @@
[package]
name = "artspace"
version = "0.1.0"
edition = "2021"
[dependencies]
zune-jpeg = {path = "third_party/zune-jpeg"}

14
README.md

@ -0,0 +1,14 @@
artspace
========
Tabs or spaces, the eternal question. Why limit yourself to whitespace that doesn't even make a visual difference?
Instead, turn that wasted whitespace into artspace!
## Usage
```
cargo run --release -- test_image.jpg src/main.rs artspace.rs
```
Note: Only compatible with languages with `/* this style of multi-line comment */`.

132
artspace.rs

@ -0,0 +1,132 @@
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠗ ⣼⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⣀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠇ ⢀⣼⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠃ ⣀⣾⣿⣿⣿⣿⣿*/
pub fn convert_bitmap_to_unicode(w: usize, h: usize, data: Vec<u8>) -> Vec<Vec<char>>/*⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠓ ⣈⣬⣿⣿⣿⣿⣿⣿⣿*/
{/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠓ ⢀⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃ ⣸⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠁ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⡿⠳⠁ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠃ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃ ⣀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⡷⠓ ⢀⣬⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿*/
/**/const CHARS: [&str; 4] = [/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃ ⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠓ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⣈⣮⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠓ */
/*⣿⣿⣿⣿*/" ⠁⠂⠃⠄⠅⠆⠇⠈⠉⠊⠋⠌⠍⠎⠏⠐⠑⠒⠓⠔⠕⠖⠗⠘⠙⠚⠛⠜⠝⠞⠟⠠⠡⠢⠣⠤⠥⠦⠧⠨⠩⠪⠫⠬⠭⠮⠯⠰⠱⠲⠳⠴⠵⠶⠷⠸⠹⠺⠻⠼⠽⠾⠿",/* ⣈⣾⣿⣿⣿⣿⣿⣿⣿⡿⠓⠁ ⢀⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ */
/*⣿⣿⣿⣿*/"⡀⡁⡂⡃⡄⡅⡆⡇⡈⡉⡊⡋⡌⡍⡎⡏⡐⡑⡒⡓⡔⡕⡖⡗⡘⡙⡚⡛⡜⡝⡞⡟⡠⡡⡢⡣⡤⡥⡦⡧⡨⡩⡪⡫⡬⡭⡮⡯⡰⡱⡲⡳⡴⡵⡶⡷⡸⡹⡺⡻⡼⡽⡾⡿",/*⣾⣿⣿⣿⣿⣿⣿⣿⡿⠓ ⣈⣮⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ */
/*⣿⣿⣿⣿*/"⢀⢁⢂⢃⢄⢅⢆⢇⢈⢉⢊⢋⢌⢍⢎⢏⢐⢑⢒⢓⢔⢕⢖⢗⢘⢙⢚⢛⢜⢝⢞⢟⢠⢡⢢⢣⢤⢥⢦⢧⢨⢩⢪⢫⢬⢭⢮⢯⢰⢱⢲⢳⢴⢵⢶⢷⢸⢹⢺⢻⢼⢽⢾⢿",/*⣿⣿⣿⣿⣿⣿⣿⠗ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ */
/*⣿⣿⣿⣿*/"⣀⣁⣂⣃⣄⣅⣆⣇⣈⣉⣊⣋⣌⣍⣎⣏⣐⣑⣒⣓⣔⣕⣖⣗⣘⣙⣚⣛⣜⣝⣞⣟⣠⣡⣢⣣⣤⣥⣦⣧⣨⣩⣪⣫⣬⣭⣮⣯⣰⣱⣲⣳⣴⣵⣶⣷⣸⣹⣺⣻⣼⣽⣾⣿",/*⣿⣿⣿⣿⣿⠷⠁ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠳⠁ */
/**/];/* ⢀⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⡿⠃ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠷⠑ */
/**/let bitchars = CHARS.iter().flat_map(|t| t.chars()).collect::<Vec<_>>();/*⣿⠷ ⢀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠷⠑ */
/*⣿⣿⣿⠿ ⢀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡷⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⢌ ⣼⣿⣿⣿⣿⣿⣿⣿⡿⠁ ⢀⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠓⠁ ⢀*/
/**/let px = |i: usize, j: usize| if i < w && j < h {data[j * w + i]} else {0};/* ⢀⣬⣿⣿⣿⣿⣿⣿⣿⣿⡿⠓ ⢀⣬⣿*/
/*⣿⠿ ⣨⣿⣿⣿⣿⣿⣿⣿⡿⠓ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯ ⣠⣿⣿⣿⣿⣿⣿⣿⠷ ⣀⣾⣿⣿⣿⣿⣿⣿⣿⣿⠗ ⣬⣿⣿⣿*/
/**/let mut output = vec![];/* ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⢏ ⣿⣿⣿⣿⣿⣿⣿⠿ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠌ ⣰⣿⣿⣿⣿*/
/*⣿ ⣼⣿⣿⣿⣿⣿⣿⣿⠇ ⡰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⣠⣿⣿⣿⣿⣿⣿⣿⠏ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⠈ ⣀⣿⣿⣿⣿⣿*/
/**/for j in (0..h).step_by(4)/* ⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠌ ⣸⣿⣿⣿⣿⣿⣿⣿⠏ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⠈ ⢀⣿⣿⣿⣿⣿⣿*/
/**/{/* ⣼⣿⣿⣿⣿⣿⣿⣿⠟ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣿⣿⣿⣿⣿⣿⣿⣿⠇ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⢌ ⣰⣿⣿⣿⣿⣿⣿*/
/* */let mut line = vec![];/* ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣰⣿⣿⣿⣿⣿⣿⣿⣿ ⠰⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⣎⢌ ⣷⣿⣿⣿⣿⣿*/
/* */for i in (0..w).step_by(2)/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⢀⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⠱⡳⡷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⠈ ⠱⣷⣿⣿⣿*/
/* */{/*⣿⣿⣿⣿⣿⣿⣿⣿⠿ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⢀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⠑⠱⠳⡷⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎ ⠐⡳⣿*/
/* */let mut index = 0;/* ⣳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⢀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⠐⠱⣷⣿⣿⣿⣿⣿⣿⣿⣿⣯ ⠐*/
/**/index |= if px(i+0, j+0) < 128 {0} else {1 << 0};/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⠱⣷⣿⣿⣿⣿⣿⣿⠷ */
/* ⢀⣿*/index |= if px(i+0, j+1) < 128 {0} else {1 << 1};/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠇ ⣳⣿⣿⣿⠷⠓ */
/* ⣸⣿*/index |= if px(i+0, j+2) < 128 {0} else {1 << 2};/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿ ⢈⣌⣬⣮⣮⣮⣎⣌⣌⣌⣌⣾⣿⡿⠁ */
/* ⣀⣿⣿*/index |= if px(i+0, j+3) < 128 {0} else {1 << 3};/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃ ⢀⣬⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⢌ */
/* ⣈⣿⣿⣿*/index |= if px(i+1, j+0) < 128 {0} else {1 << 4};/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⠗ ⣈⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⣌⠈ */
/* ⢀⣼⣿⣿⣿⣿*/index |= if px(i+1, j+1) < 128 {0} else {1 << 5};/*⣿⣿⣿⣿⣿⣿⣿⡿⠃ ⣼⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⢌ */
/*⢌⢈⣿⣿⣿⣿⣿⣿*/index |= if px(i+1, j+2) < 128 {0} else {1 << 6};/*⣿⣿⣿⣿⣿⣿⠷ ⣼⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡷⡷⡷⡷⣿⣿⣿⣿⣿⣯⣎⢈ ⣴*/
/*⣿⣿⣿⣿⣿⣿⣿⣿*/index |= if px(i+1, j+3) < 128 {0} else {1 << 7};/*⣿⣿⣿⣿⡷⠃ ⣨⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠓ ⠑⡳⣿⣿⣿⣿⣿⣿⣮⣮⣮⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿ ⠐⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠑ ⣀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿⠁ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣿*/line.push(bitchars[index]);/* ⠱⡷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡷⠳⠑ ⣈⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷ ⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿*/}/*⠟ ⣾⣿⣿⣿⣿⣿⣿⣿⣿⣎⠈ ⠐⠱⡷⣷⣿⡷⡷⠳⠑⠁ ⢀⣬⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠃ ⢀⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿*/output.push(line);/*⣿⣿⣿⣿⣿⣿⣿⣿⣎⢌ ⢀⣌⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠗ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/**/}/*⣿⣿⣿⣿ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣮⣌⢈⠈ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠳⠁ ⢀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⣀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⣮⣌⢈ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠷⠑ ⣀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/**/output/* ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⢌ ⠈⢀⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠑ ⢀⣬⣿⣿⣿⣿⣿⣿⣿⣿⡿⠷⠓⣳*/
}/*⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠓⠑⠑⠑⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⠈ ⢀⡈⣚⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡷⠓ ⢈⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⠓ */
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃ ⠐⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⠈ ⢈⣽⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠳⠁ ⢈⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠁ */
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠌ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⠟ ⠐⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⣌⢈⢈⢈⣌⣬⣮⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠓ ⢀⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠁ */
fn read_jpeg_to_bitmap(file: &str) -> (usize, usize, Vec<u8>)/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⢀⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿ */
{/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠌ ⣷⣿⣿⣿⣿⣿⣿⠿ ⠱⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠁ */
/**/use zune_jpeg::zune_core::{colorspace::ColorSpace, options::DecoderOptions, bytestream::ZCursor};/*⣿⣿⣿⣿⣿⠁ */
/**/let data = std::fs::read(file).unwrap();/* ⣳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿⠁ ⢀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ */
/**/let options = DecoderOptions::default().jpeg_set_out_colorspace(ColorSpace::Luma);/*⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿ */
/**/let mut decoder = zune_jpeg::JpegDecoder::new_with_options(ZCursor::new(&data), options);/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠁ */
/**/let pixels = decoder.decode().unwrap();/* ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿⠁ ⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠁ */
/**/let (w, h) = decoder.dimensions().unwrap();/* ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿ ⢀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠗ */
/**/(w, h, pixels)/*⣯⣌⣌⣬⣾⣿⣿⣿⠎ ⢬⣭⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⣳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ */
}/* ⠰⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⣐⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ */
/*⣯⠌ ⠐⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠁ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠈ ⣳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ */
/*⣿⣿⠌ ⠐⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯ */
fn main()/* ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⠁ ⡰⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠌ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ */
{/*⣿⣿⠏ ⣱⣿⣿⣿⣿⣿⣿⣿⠿ ⣳⣿⣿⣿⣿⣿⣿⣿⠿ ⢀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠌ ⣹⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏ */
/**/match std::env::args().collect::<Vec<_>>().as_slice() {/*⣿⣿⣿⣿⣿⣿⣿⠌ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⢎ */
/*⣿⣿⣿⣿*/[_, bitmap_file, source_file, output_file] =>/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⢌ ⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⢎ */
/*⣿⣿⣿⣿*/{/* ⠰⣿⣿⣿⣿⣿⣿⣿⣏ ⣨⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⠈ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⣎⠈ */
/*⣿⣿⣿⣿⣿ */let (w, h, pixels) = read_jpeg_to_bitmap(&bitmap_file);/*⣿⣿⣯⢌ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⣎⣌⢌⢈⢈⢈⢈*/
/*⣿⣿⣿⣿⣿⠏ ⡰⣿⣿⣿⣿⣿⣿⣿⣿⣯⢌ ⣼⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠑⠑⠑⠱⣷⣿⣿⣿⣿⣿⣿⣿⣯⢌ ⢀⣀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿ */let char_bitmap = convert_bitmap_to_unicode(w, h, pixels);/*⣿⣿⣿⣯⣎⢈ ⢀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡷⠳⠳⠑⠑⠑⠑⠑⠑⠳⡳⡷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⢏ ⠐⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⠱⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡷⠓ ⠑⠳⣷⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⠌*/let source = std::fs::read_to_string(source_file).unwrap();/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⠐⠳⣷⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣯ ⡱⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎ ⡰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⢏ ⡱⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠓ ⠐⠱⠳⡷*/
/*⣿⣿⣿⣿⣿⣿⣿⣿*/let max_width = 120;/*⣏ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⠰⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃ */
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⠈ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⠐⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠁ */
/*⣿⣿⣿⣿⣿⣿⣿⣿*/let mut modified_lines = vec![];/* ⠐⡳⣿⣿⣿⣿⣿⣿⣿⣿⠁ ⠱⡳⣿⣿⣿⣿⣿⣿⣿⡿⠓ ⢈⢈⢈⢈⢈⢈⢈⠈ */
/*⣿⣿⣿⣿⣿⣿⣿⣿*/let mut buffer = Vec::with_capacity(1024);/*⣿⣿⣿⠿ ⠑⠱⠳⠳⠓⠁ ⢈⣌⣬⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣮⣌⠈ */
/*⣿⣿⣿⣿⣿⣿⣿⣿*/let mut row = 0;/*⣿⣿⣿⣿⣿⣿⣿⣿⣎ ⠐⠑⠑⠁ ⢀⣈⣮⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣮⢌⠈ */
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⢏ ⣸⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏⠈ ⢀⢈⣌⣮⣮⣎⣌⢈ ⣈⣮⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣮*/
/*⣿⣿⣿⣿⣿⣿⣿⣿*/for line in source.lines()/*⣿⢎ ⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⠈ ⢀⣬⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣿*/{/* ⢀⣜⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⢌ ⣼⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣮⢌ ⢀⣈⣮⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡷⠷⠳⠳⠳⠳⠳⠳⠳⡷⡷⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⡷⡷⡷⡷⣿⣿⣿⣿⣿⣿*/buffer.clear();/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⢌ ⣸⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣮⣮⣌⣬⣮⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠷⠓⠁ ⠐⠱⠳⡷⣿⣿⣿⣿*/
/*⠓ ⠐⣷⣿⣿⣿⣿*/buffer.extend(line.chars());/*⣯⢌⠈ ⣨⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠑ ⠑⠱⠳*/
/* ⡳⣿⣿⣿*/if buffer.len() < max_width {/*⣿⣿⣮⢌⠈ ⣨⣿⣿⣿⣿⣿⣿⣿⣿⣿⡷⠳⠳⠑⠑⠳⡷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ */
/* ⠱⣷⣿⣿⣿⣿⣿*/let needed = max_width - buffer.len();/*⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⠐⠳⣷⣿⣿⣿⣿⣿⣿⣿⡿⠳⠁ */
/* ⠑⠑⠑ */buffer.extend(core::iter::repeat(' ').take(needed));/* ⠐⠱⠳⠳⠳⠳⠑ ⢈⣈⣌⣌⣬⣮⣎⣌⣌⢈ */
/* */}/* ⠱⡷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠗ ⣈⣮⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⣎⢌ */
/* ⢈⣌⣬⣎⢌⠈ ⢈⢈⢈⢈⢈⢈⢈⠈ ⠐⠱⡷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣮⢌⠈ */
/*⣿⣿⣿⣿⣿⣿⣿⠌ */let mut i = 0;/*⣮⣎⣌⢈ ⠱⡳⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿⠁ ⢈⣌⣌⣌⠈ ⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣮⣌⢈⢈⢈*/
/*⣿⣿⣿⣿⣿⣿⣿⣿⢎ */for j in 0..buffer.len()/* ⠐⠱⡳⣷⣿⣿⣿⡿⠓ ⣈⣾⣿⣿⣿⣿⣿⣯⢌ ⢀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⣌⢈*/{/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⣌⠈ ⠁ ⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⢈ ⢀⣬⣿⣿⣿⣿⣿⣿⣿⡿⠷⠳⠓⠑⠑⠑⠑⠑⠑⠱⡳⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/if !buffer[j].is_whitespace()/* ⢀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⣎⣌⣌⣾⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⠳⠓⠑⠑⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/{/* ⠐⠱⠳⡷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⢌ ⣨⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⠰⣷⣿⣿⣿⣿⣿⣿⣿⣿*/
/* ⣳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠟ */if j - i > 3/*⡷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣮⣌⢈ ⣈⣾⣿⣿⣿⣿⣿⡷⡷⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⠐⡳⣿⣿⣿⣿⣿⣿⣿*/
/* ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ */{/* ⠐⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⣎⢌⠈ ⢈⣬⣾⣿⣿⣿⣿⠷⠑ ⣱⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⠐⣳⣿⣿⣿⣿⣿*/
/* ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯ */let bmp_row = &char_bitmap[row];/*⣿⣿⣿⡿⠃ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠁ ⡱⣿⣿⣿⣿*/
/* ⣼⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏ */for k in i..j {/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿⠁ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿ ⠰⣷⣿⣿*/
/* ⣨⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏ */buffer[k] = bmp_row[k % bmp_row.len()];/*⣰⣿⣿⣿⣿⣿⣿⣿⣿⠗ ⠐⣷⣿*/
/* ⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⠌ */}/* ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠇ ⣿⣿⣿⣿⣿⣿⣿⣿⠿ ⣈⣌⢈ ⠐⡳*/
/*⢀⣌⣾⣿⣿⣿⣿⠷⠑ ⠐⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⢎ */buffer[i ] = '/';/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿ ⣰⣿⣿⣿⣿⣿⣿⣿⡿ ⣀⣾⣿⣿⣿⣯⠈ */
/*⣿⣿⣿⣿⣿⠿⠁ ⣱⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎*/buffer[i+1] = '*';/*⣳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣿⣿⣿⣿⣿⣿⣿⣿⠃ ⣀⣿⣿⣿⣿⣿⣿⣿⠈ */
/*⣿⣿⣿⣿⠗ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/buffer[j-1] = '/';/*⡰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣠⣿⣿⣿⣿⣿⣿⣿⠿ ⣀⣿⣿⣿⣿⣿⣿⣿⣿⣏ */
/*⣿⣿⣿⠿ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/buffer[j-2] = '*';/* ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣰⣿⣿⣿⣿⣿⣿⣿⠇ ⣸⣿⣿⣿⣿⣿⣿⣿⣿⣿⠎ */
/*⣿⣿⡿ ⣿⣿⣿⣿⣿⣿⣿*/}/* ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣸⣿⣿⣿⣿⣿⣿⣿ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ */
/*⣿⣿⠃ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⣀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣿⣿⣿⣿⣿⣿⣿⠿ ⣀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ */
/*⣿⠿ ⣿⣿⣿⣿⣿⣿⣿*/i = j + 1;/* ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣰⣿⣿⣿⣿⣿⣿⠏ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠁ */
/*⣿⠃ ⣠⣿⣿⣿*/}/*⣿⣿⣿⣿ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣿⣿⣿⣿⣿⣿⣿⠏ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ */
/*⣿⠈ */}/*⣿⣿⣿⣿⣿⣿⣿⣿⣏⠈ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣳⣿⣿⣿⣿⣿⣿⠏ ⡰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣌⢈ */
/*⣿⣯⠈ ⠐⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⠈ ⣳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⠰⣿⣿⣿⣿⣿⣿⣯⠈ ⠱⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⠈ */
/*⣿⣿⣿⠌ */let j = buffer.len();/* ⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣷⣿⣿⣿⣿⣿⣿⣏ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⢌ */
/*⣿⣿⣿⣿⠈ */if j - i > 3/*⣿⣿⣿⣎⠈ ⣳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣰⣿⣿⣿⣿⣿⣿⣿⢎ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠌ */
/*⣿⣿⣿⣿⣏ */{/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⢈ ⠱⣷⣿⣿⣿⣿⣿⣿⣿⣿⠃ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⠌ ⠐⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠯ */
/*⣿⣿⣿⣿⣿⢎ ⣰*/let bmp_row = &char_bitmap[row];/*⠱⣷⣿⣿⣿⣿⣿⠓ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⠈ ⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃ ⢈⣬⣮⣿⣿*/
/*⣳⣿⣿⣿⣿⣿⣎⠈ ⢀⣀⣿*/for k in i..j {/*⣿⣿⣯⢌ ⠐⠳⠳⠳⠁ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯ ⠐⠳⣷⣿⣿⣿⣿⣿⠷⠁ ⢀⣬⣾⣿⣿⣿⣿⣿*/
/* ⣿⣿⣿⣿⣿⣿⣿⣯⣌⢌⢈⣌⣬⣿⣿⣿⣿⣿⣿*/buffer[k] = bmp_row[k % bmp_row.len()];/* ⣸⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏ ⠑⠑⠁ ⣈⣾⣿⣿⣿⣿⣿⣿⣿*/
/*⣀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/}/*⠁ ⠑⠱⡷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⠈ ⣀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⣨⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/buffer[i ] = '/';/*⣿⣿⣿⣿⣿⣿⣎⢈ ⢀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠈ ⢈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿*/buffer[i+1] = '*';/*⣿⣿⣿⣿⣿⣿⣿⣿⣯⢌ ⣀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⢀⢈⣬⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠓ */buffer[j-1] = '/';/*⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⢌⠈ ⢀⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏ ⣈⣮⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠟ */buffer[j-2] = '*';/* ⠐⠱⡷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣮⣎⣌⢈⢈ ⢀⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠌ ⣠⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠑⠑⠱⣷⣿⣿⣿⣿⣿*/
/* ⠱⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿*/}/* ⠐⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣮⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⠌ ⣰⣿⣿⣿⣿⣿⣿⣿⡿⠁ ⣿⣿⣿⣿⣿*/
/* ⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏ ⢀⣈⣌⣌⣌⢈ ⠐⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠓⠑⠑⠑⠑⠳⡷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⠈ ⡰⣿⣿⣿⣿⣿⣿⣿⠃ ⣰⣿⣿⣿⣿*/
/* ⡱⣿⣿⣿⣿⣿⣿⣿*/modified_lines.push(buffer.iter().collect::<String>());/* ⠐⠱⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣎⠈ ⣳⣿⣿⣿⣿⣿⣿ ⣰⣿⣿⣿⣿*/
/* ⣱⣿⣿⣿⣿⣿⣿*/row += 1;/* ⣸⣿⣿⣿⣿⣿⣿⣿⣿⣯⠈ ⣳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠃ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⢎ ⠐⣿⣿⣿⣿⣿⣿ ⣰⣿⣿⣿⣿*/
/* ⣿⣿⣿*/}/*⣿⣿ ⣀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏ ⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠓ ⢀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿ ⢀⣿⣿⣿⣿⣿⠏ ⣰⣿⣿⣿⣿*/
/* ⣼⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣀⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠎ ⠰⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⡈⣚⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠁ ⣰⣿⣿⣿⣿⣿⠁ ⠐⣿⣿⣿⣿*/
/* ⣀⣿⣿⣿⣿*/let new_source = modified_lines.join("\n");/*⠑ ⡀⣚⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠓ ⣾⣿⣿⣿⣿⣿ ⣰⣿⣿⣿⣿*/
/* ⢀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⠟ ⣸⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠎ ⢀⣌⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠓ ⣸⣿⣿⣿⣿⣿⣿ ⣰⣿⣿⣿⣿*/
/* ⣼⣿⣿⣿⣿⣿*/println!("{new_source}");/*⣿⣿⣿⣯ ⢀⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠓ ⣼⣿⣿⣿⣿⣿⣿⠿ ⣰⣿⣿⣿⣿*/
/* ⣸⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠟ ⣼⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠌ ⢈⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⢀⣾⣿⣿⣿⣿⣿⣿⣿⠇ ⣰⣿⣿⣿⣿*/
/*⣀⣿⣿⣿⣿⣿⣿⣿*/std::fs::write(output_file, new_source).unwrap();/*⣌⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠳⠁ ⣈⣿⣿⣿⣿⣿⣿⣿⣿⡿ ⣰⣿⣿⣿⣿*/
/*⣿⣿⣿⣿*/}/*⣿⣿⠟ ⣠⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠎ ⢀⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠓ ⢀⣼⣿⣿⣿⣿⣿⣿⣿⣿⡿⠁ ⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿*/[path] =>/* ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏ ⣨⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠑ ⣀⣾⣿⣿⣿⣿⣿⣿⣿⣿⡿⠁ ⣠⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿*/{/*⣿⠟ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⣼⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⣀⣿⣿⣿⣿⣿⣿⣿⡿⠷⠓ ⣼⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⣿*/println!("usage: {path} art.jpg input_src output_src");/*⣿⡿⠳⠁ ⡰⣷⡷⡷⠷⠳⠑⠁ ⣨⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿*/}/*⠇ ⠑⠑ ⠐⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠌ ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡿⠓ ⣨⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿*/_ =>/* ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣰⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠷ ⢀⣌⣬⣮⣾⣿⣿⣿⣿⣿⠌ ⢀⣼⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿*/{/* ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⠰⣿⣿⣿⣿⣿⣿⣿⣿⠷⠁ ⢀⣬⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿ ⢀⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿*/}/* ⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⣷⣿⣿⣿⣿⣿⠷⠁ ⢀⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⢎ ⣼⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/**/}/*⣿⣿ ⣠⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠏ ⠱⠳⠳⠑ ⣬⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⢎ ⠱⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿*/
/*⣿⣿⣿⣿⣿⣿⣿⠌ ⣼⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠌ ⡳⡷⣷⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣏⠈ ⠑⡳⣷⣿⣿⣿⣿⣿⣿*/
}/*⣿⣿⣿⣿⣿⣿⣯⠈ ⣈⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣯⠈ ⢀⢈⢈⢈ ⠑⠱⡳⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠌ ⠑⠳⡷⣿⣿⣿*/

132
src/main.rs

@ -0,0 +1,132 @@
/// Convert an 8-bit grayscale bitmap into rows of braille characters.
///
/// Each braille glyph encodes a 2x4 block of pixels (2 wide, 4 tall):
/// a pixel with value >= 128 lights the corresponding dot. Pixels read
/// outside the bitmap bounds count as black (0), so `w` and `h` need
/// not be multiples of the block size.
///
/// Returns one `Vec<char>` per glyph row, top to bottom.
pub fn convert_bitmap_to_unicode(w: usize, h: usize, data: Vec<u8>) -> Vec<Vec<char>>
{
    // Lookup table, 4 rows of 64 glyphs each; concatenated it maps every
    // 8-bit dot pattern (as computed below) to one braille character.
    const CHARS: [&str; 4] = [
        " ⠁⠂⠃⠄⠅⠆⠇⠈⠉⠊⠋⠌⠍⠎⠏⠐⠑⠒⠓⠔⠕⠖⠗⠘⠙⠚⠛⠜⠝⠞⠟⠠⠡⠢⠣⠤⠥⠦⠧⠨⠩⠪⠫⠬⠭⠮⠯⠰⠱⠲⠳⠴⠵⠶⠷⠸⠹⠺⠻⠼⠽⠾⠿",
        "⡀⡁⡂⡃⡄⡅⡆⡇⡈⡉⡊⡋⡌⡍⡎⡏⡐⡑⡒⡓⡔⡕⡖⡗⡘⡙⡚⡛⡜⡝⡞⡟⡠⡡⡢⡣⡤⡥⡦⡧⡨⡩⡪⡫⡬⡭⡮⡯⡰⡱⡲⡳⡴⡵⡶⡷⡸⡹⡺⡻⡼⡽⡾⡿",
        "⢀⢁⢂⢃⢄⢅⢆⢇⢈⢉⢊⢋⢌⢍⢎⢏⢐⢑⢒⢓⢔⢕⢖⢗⢘⢙⢚⢛⢜⢝⢞⢟⢠⢡⢢⢣⢤⢥⢦⢧⢨⢩⢪⢫⢬⢭⢮⢯⢰⢱⢲⢳⢴⢵⢶⢷⢸⢹⢺⢻⢼⢽⢾⢿",
        "⣀⣁⣂⣃⣄⣅⣆⣇⣈⣉⣊⣋⣌⣍⣎⣏⣐⣑⣒⣓⣔⣕⣖⣗⣘⣙⣚⣛⣜⣝⣞⣟⣠⣡⣢⣣⣤⣥⣦⣧⣨⣩⣪⣫⣬⣭⣮⣯⣰⣱⣲⣳⣴⣵⣶⣷⣸⣹⣺⣻⣼⣽⣾⣿",
    ];
    // Flatten the table into a single 256-entry palette, indexable by
    // the 8-bit dot pattern.
    let palette: Vec<char> = CHARS.iter().flat_map(|row| row.chars()).collect();
    // Sample a pixel; anything outside the bitmap reads as black.
    let sample = |x: usize, y: usize| -> u8 {
        if x < w && y < h { data[y * w + x] } else { 0 }
    };
    (0..h)
        .step_by(4)
        .map(|y| {
            (0..w)
                .step_by(2)
                .map(|x| {
                    // Bit layout: bits 0-3 are the left column (top to
                    // bottom), bits 4-7 the right column.
                    let mut bits = 0usize;
                    for dx in 0..2 {
                        for dy in 0..4 {
                            if sample(x + dx, y + dy) >= 128 {
                                bits |= 1 << (dx * 4 + dy);
                            }
                        }
                    }
                    palette[bits]
                })
                .collect::<Vec<char>>()
        })
        .collect()
}
/// Decode a JPEG file into an 8-bit grayscale (luma) bitmap.
///
/// Returns `(width, height, pixels)` where `pixels` is row-major,
/// one byte per pixel. Panics if the file cannot be read or is not a
/// decodable JPEG.
fn read_jpeg_to_bitmap(file: &str) -> (usize, usize, Vec<u8>)
{
    use zune_jpeg::zune_core::bytestream::ZCursor;
    use zune_jpeg::zune_core::colorspace::ColorSpace;
    use zune_jpeg::zune_core::options::DecoderOptions;
    let bytes = std::fs::read(file).unwrap();
    // Ask the decoder for single-channel luma output directly so no
    // manual RGB-to-gray conversion is needed.
    let opts = DecoderOptions::default().jpeg_set_out_colorspace(ColorSpace::Luma);
    let mut jpeg = zune_jpeg::JpegDecoder::new_with_options(ZCursor::new(&bytes), opts);
    let pixels = jpeg.decode().unwrap();
    // Dimensions are only available after the headers have been parsed,
    // so query them after `decode()`.
    let (width, height) = jpeg.dimensions().unwrap();
    (width, height, pixels)
}
/// Overwrite the whitespace run `buffer[i..j]` with characters taken from
/// `bmp_row`, then frame the run as a `/* ... */` comment.
///
/// Caller must guarantee `j - i > 3` (room for the four delimiter
/// characters) and that `bmp_row` is non-empty.
fn fill_comment(buffer: &mut [char], bmp_row: &[char], i: usize, j: usize)
{
    for k in i..j {
        buffer[k] = bmp_row[k % bmp_row.len()];
    }
    buffer[i] = '/';
    buffer[i + 1] = '*';
    buffer[j - 2] = '*';
    buffer[j - 1] = '/';
}

/// Entry point: reads a JPEG, converts it to braille art, and embeds the
/// art into the whitespace of a source file as `/* ... */` comments.
///
/// Usage: `artspace <art.jpg> <input_src> <output_src>`
/// The transformed source is printed to stdout and written to
/// `<output_src>`.
fn main()
{
    match std::env::args().collect::<Vec<_>>().as_slice() {
        [_, bitmap_file, source_file, output_file] =>
        {
            let (w, h, pixels) = read_jpeg_to_bitmap(bitmap_file);
            let char_bitmap = convert_bitmap_to_unicode(w, h, pixels);
            let source = std::fs::read_to_string(source_file).unwrap();
            // Pad every line out to this width so trailing art has room.
            let max_width = 120;
            let mut modified_lines = vec![];
            let mut buffer = Vec::with_capacity(1024);
            let mut row = 0;
            for line in source.lines()
            {
                buffer.clear();
                buffer.extend(line.chars());
                if buffer.len() < max_width {
                    let needed = max_width - buffer.len();
                    buffer.extend(core::iter::repeat(' ').take(needed));
                }
                // Pick the art row for this source line. Wrap with a modulus
                // instead of indexing directly: the previous version indexed
                // `char_bitmap[row]`, which panicked whenever the source had
                // more lines than the image had braille rows. Degenerate
                // bitmaps (zero width or height) simply produce no art.
                let bmp_row: &[char] = match char_bitmap.get(row % char_bitmap.len().max(1)) {
                    Some(r) => r,
                    None => &[]
                };
                // `i` tracks the position just past the last non-whitespace
                // character, so `buffer[i..j]` is always a whitespace run.
                let mut i = 0;
                for j in 0..buffer.len()
                {
                    if !buffer[j].is_whitespace()
                    {
                        // Only fill runs wide enough for "/*" and "*/".
                        if j - i > 3 && !bmp_row.is_empty()
                        {
                            fill_comment(&mut buffer, bmp_row, i, j);
                        }
                        i = j + 1;
                    }
                }
                // Handle the trailing whitespace run at the end of the line.
                let j = buffer.len();
                if j - i > 3 && !bmp_row.is_empty()
                {
                    fill_comment(&mut buffer, bmp_row, i, j);
                }
                modified_lines.push(buffer.iter().collect::<String>());
                row += 1;
            }
            let new_source = modified_lines.join("\n");
            println!("{new_source}");
            std::fs::write(output_file, new_source).unwrap();
        }
        [path] =>
        {
            println!("usage: {path} art.jpg input_src output_src");
        }
        _ =>
        {
        }
    }
}

BIN
test_image.jpg

Binary file not shown.

After

Width:  |  Height:  |  Size: 567 KiB

22
third_party/zune-core/CHANGELOG.md vendored

@ -0,0 +1,22 @@
## 0.2.14
- Fixed building with no-std
- Add `peek_at` and `pos` for writer
- Make serde non default
- Add option to make PNG add an alpha channel
## 0.2.12
- Add endianness conversion
- Hide exposed values for EncoderOptions
- Add Float32 bit depth
- Remove support for BitDepth 10 and 12
- Add bit_size method
## 0.2.1
Improve documentation on various parts
## 0.2.0
Initial version

22
third_party/zune-core/Cargo.toml vendored

@ -0,0 +1,22 @@
[package]
name = "zune-core"
version = "0.5.0-rc1"
edition = "2021"
description = "Core utilities for image processing in the zune family of crates"
exclude = ["tests/"]
repository = "https://github.com/etemesi254/zune-image"
keywords = ["image"]
categories = ["multimedia::images", "multimedia::encoding"]
license = "MIT OR Apache-2.0 OR Zlib"
[features]
# When present, we can use std facilities to detect
# if a specific feature exists
# Not enabled by default. Other zune crates can enable dep:zune-core/std by default.
# But if we enable it here, they can't disable it anymore.
# See: https://github.com/rust-lang/cargo/issues/8366
std = []
[dependencies]
log = { version = "0.4.17", optional = true }
serde = { version = "1.0.52", optional = true }

1
third_party/zune-core/LICENSE-APACHE vendored

@ -0,0 +1 @@
../../LICENSE-APACHE

1
third_party/zune-core/LICENSE-MIT vendored

@ -0,0 +1 @@
../../LICENSE-MIT

1
third_party/zune-core/LICENSE-ZLIB vendored

@ -0,0 +1 @@
../../LICENSE-ZLIB

15
third_party/zune-core/README.md vendored

@ -0,0 +1,15 @@
## Zune core
Core primitives necessary for image manipulations
This crate contains a small set of primitives
necessary for image manipulations which are shared among most `zune-` family
of decoders and encoders.
### Items present
Currently, it contains:
- Colorspace definitions
- Bit depth definitions.
- Decoder and encoder options

170
third_party/zune-core/src/bit_depth.rs vendored

@ -0,0 +1,170 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! Image bit depth, information and manipulations
/// The image bit depth.
///
/// The library successfully supports depths up to
/// 16 bits, as the underlying storage is usually a `u16`.
///
/// This allows us to comfortably support a wide variety of images
/// e.g 10 bit av1, 16 bit png and ppm.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum BitDepth {
    /// U8 bit depth.
    ///
    /// Images with such bit depth use [`u8`] to store
    /// pixels and use the whole range from 0-255.
    ///
    /// It is currently the smallest supported bit depth
    /// by the library.
    ///
    /// For images with bit depths lower than this, they will be scaled
    /// to this bit depth
    Eight,
    /// U16 bit depth
    ///
    /// Images with such bit depths use [`u16`] to store values and use the whole range
    /// i.e 0-65535
    ///
    /// Data is stored and processed in native endian.
    Sixteen,
    /// Floating point 32 bit data, range is 0.0 to 1.0
    ///
    /// Uses f32 to store data
    Float32,
    /// Bit depth information is unknown.
    ///
    /// This is the [`Default`] variant.
    Unknown
}
/// The underlying bit representation of the image
///
/// This represents the minimum rust type that
/// can be used to represent image data, required
/// by `Channel` struct in zune-image
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum BitType {
    /// Images represented using a [`u8`] as their
    /// underlying pixel storage
    U8,
    /// Images represented using a [`u16`] as their
    /// underlying pixel storage.
    U16,
    /// Images represented using a [`f32`] as their
    /// underlying pixel storage
    /// (see [`BitDepth::Float32`])
    F32
}
impl BitType {
    /// Return the equivalent of the image bit type's depth
    ///
    /// The mapping is total: every `BitType` has a known [`BitDepth`]
    /// counterpart, so this never fails or returns `BitDepth::Unknown`.
    pub fn to_depth(self) -> BitDepth {
        match self {
            BitType::U8 => BitDepth::Eight,
            BitType::U16 => BitDepth::Sixteen,
            BitType::F32 => BitDepth::Float32
        }
    }
}
impl Default for BitDepth {
    // Unknown is the conservative default until an actual depth is
    // determined from an image's headers.
    fn default() -> Self {
        Self::Unknown
    }
}
impl BitDepth {
    /// Get the max value supported by the bit depth
    ///
    /// During conversion from one bit depth to another
    ///
    /// larger values should be clamped to this bit depth
    #[rustfmt::skip]
    #[allow(clippy::zero_prefixed_literal)]
    pub const fn max_value(self) -> u16
    {
        match self
        {
            Self::Eight => (1 << 08) - 1,
            Self::Sixteen => u16::MAX,
            // Float32 samples are documented as 0.0..=1.0, so the
            // integer maximum is 1.
            Self::Float32 => 1,
            Self::Unknown => 0,
        }
    }
    /// Return the minimum number of bits that can be used to represent
    /// each pixel in the image
    ///
    /// All bit depths below 8 return a bit type of `BitType::U8`.
    /// and all those above 8 and below 16 return a bit type of [`BitType::U16`]
    ///
    /// # Returns
    /// An enum whose variants represent the minimum size for an unsigned integer
    /// which can store the image pixels without overflow
    ///
    /// # Panics
    /// Panics if the depth is [`BitDepth::Unknown`]
    ///
    /// # Example
    ///
    /// ```
    /// use zune_core::bit_depth::{BitDepth, BitType};
    /// assert_eq!(BitDepth::Eight.bit_type(),BitType::U8);
    ///
    /// assert_eq!(BitDepth::Sixteen.bit_type(),BitType::U16);
    /// ```
    ///
    /// See also [size_of](BitDepth::size_of)
    pub const fn bit_type(self) -> BitType {
        match self {
            Self::Eight => BitType::U8,
            Self::Sixteen => BitType::U16,
            Self::Float32 => BitType::F32,
            Self::Unknown => panic!("Unknown bit type")
        }
    }
    /// Get the number of bytes needed to store a specific bit depth
    ///
    /// # Panics
    /// Panics if the depth is [`BitDepth::Unknown`]
    ///
    /// # Example
    /// For images less than or equal to 8 bits(1 byte), we can use a [`u8`] to store
    /// the pixels, and a size_of [`u8`] is 1
    ///
    /// For images greater than 8 bits and less than 16 bits(2 bytes), we can use a [`u16`] to
    /// store the pixels, a size_of [`u16`] is 2.
    /// ```
    /// use zune_core::bit_depth::BitDepth;
    /// let depth = BitDepth::Sixteen;
    /// // sixteen bits is greater than 8 and at most 16, so two bytes are needed
    /// assert_eq!(depth.size_of(),2);
    /// ```
    pub const fn size_of(self) -> usize {
        match self {
            Self::Eight => core::mem::size_of::<u8>(),
            Self::Sixteen => core::mem::size_of::<u16>(),
            Self::Float32 => core::mem::size_of::<f32>(),
            Self::Unknown => panic!("Unknown bit type")
        }
    }
    /// Return the number of bits needed to store a single pixel at this
    /// depth, i.e. [`size_of`](BitDepth::size_of) expressed in bits.
    ///
    /// # Panics
    /// Panics if the depth is [`BitDepth::Unknown`] (via `size_of`)
    pub const fn bit_size(&self) -> usize {
        self.size_of() * 8
    }
}
/// Byte endianness of returned samples
/// this is useful when the decoder returns samples which span more
/// than one byte yet the type returned is `&[u8]`
///
/// This helps you interpret how those bytes should be reconstructed
/// to a higher order type
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ByteEndian {
    /// Little Endian byte-order (least significant byte first)
    LE,
    /// Big Endian byte-order (most significant byte first)
    BE
}

27
third_party/zune-core/src/bytestream.rs vendored

@ -0,0 +1,27 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! A simple implementation of a bytestream reader
//! and writer.
//!
//! This module contains two main structs that help in
//! byte reading and byte writing
//!
//! Useful for a lot of image readers and writers, it's put
//! here to minimize code reuse
pub use reader::ZReader;
pub use traits::*;
pub use writer::ZWriter;
pub use crate::bytestream::reader::no_std_readers::*;
//use crate::bytestream::reader::std_readers::*;
pub use crate::bytestream::reader::ZByteIoError;
mod reader;
mod traits;
mod writer;

458
third_party/zune-core/src/bytestream/reader.rs vendored

@ -0,0 +1,458 @@
use alloc::string::String;
use alloc::vec;
use alloc::vec::Vec;
use core::fmt::Formatter;
pub(crate) mod no_std_readers;
pub(crate) mod std_readers;
use crate::bytestream::ZByteReaderTrait;
/// Enumeration of possible methods to seek within an I/O object.
///
/// It is analogous to the [SeekFrom](std::io::SeekFrom) in the std library but
/// it's here to allow this to work in no-std crates
#[derive(Copy, PartialEq, Eq, Clone, Debug)]
pub enum ZSeekFrom {
    /// Sets the offset to the provided number of bytes
    /// from the start of the stream.
    Start(u64),
    /// Sets the offset to the size of this object plus the specified number of
    /// bytes.
    ///
    /// It is possible to seek beyond the end of an object, but it's an error to
    /// seek before byte 0.
    End(i64),
    /// Sets the offset to the current position plus the specified number of
    /// bytes.
    ///
    /// It is possible to seek beyond the end of an object, but it's an error to
    /// seek before byte 0.
    Current(i64)
}
impl ZSeekFrom {
    /// Convert to [SeekFrom](std::io::SeekFrom) from the `std::io` library
    ///
    /// This is only present when std feature is present
    #[cfg(feature = "std")]
    pub(crate) fn to_std_seek(self) -> std::io::SeekFrom {
        // The two enums have identical variant semantics, so this is a
        // direct one-to-one translation.
        match self {
            ZSeekFrom::Start(pos) => std::io::SeekFrom::Start(pos),
            ZSeekFrom::End(pos) => std::io::SeekFrom::End(pos),
            ZSeekFrom::Current(pos) => std::io::SeekFrom::Current(pos)
        }
    }
}
/// Errors that can occur when reading from or seeking within a byte
/// stream.
///
/// `Debug` is implemented manually (see below) to give human-readable
/// messages.
pub enum ZByteIoError {
    /// A standard library error
    /// Only available with the `std` feature
    #[cfg(feature = "std")]
    StdIoError(std::io::Error),
    /// An error converting from one type to another
    TryFromIntError(core::num::TryFromIntError),
    /// Not enough bytes to satisfy a read
    // tuple fields: (bytes requested, bytes actually found)
    NotEnoughBytes(usize, usize),
    /// The output buffer is too small to write the bytes
    // tuple fields: (bytes to write, buffer size)
    NotEnoughBuffer(usize, usize),
    /// An error that may occur randomly
    Generic(&'static str),
    /// An error that occurred during a seek operation
    SeekError(&'static str),
    /// An error that occurred during a seek operation, carrying an
    /// owned message
    SeekErrorOwned(String)
}
// Hand-written Debug so each variant renders as a human-readable
// message rather than the derived variant dump.
impl core::fmt::Debug for ZByteIoError {
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        match self {
            #[cfg(feature = "std")]
            ZByteIoError::StdIoError(err) => {
                writeln!(f, "Underlying I/O error {}", err)
            }
            ZByteIoError::TryFromIntError(err) => {
                writeln!(f, "Cannot convert to int {}", err)
            }
            ZByteIoError::NotEnoughBytes(expected, found) => {
                writeln!(f, "Not enough bytes, expected {expected} but found {found}")
            }
            ZByteIoError::NotEnoughBuffer(expected, found) => {
                writeln!(
                    f,
                    "Not enough buffer to write {expected} bytes, buffer size is {found}"
                )
            }
            ZByteIoError::Generic(err) => {
                writeln!(f, "Generic I/O error: {err}")
            }
            ZByteIoError::SeekError(err) => {
                writeln!(f, "Seek error: {err}")
            }
            ZByteIoError::SeekErrorOwned(err) => {
                writeln!(f, "Seek error {err}")
            }
        }
    }
}
// `From` conversions so callers can use `?` to promote underlying error
// types into `ZByteIoError`.
#[cfg(feature = "std")]
impl From<std::io::Error> for ZByteIoError {
    fn from(value: std::io::Error) -> Self {
        ZByteIoError::StdIoError(value)
    }
}
impl From<core::num::TryFromIntError> for ZByteIoError {
    fn from(value: core::num::TryFromIntError) -> Self {
        ZByteIoError::TryFromIntError(value)
    }
}
impl From<&'static str> for ZByteIoError {
    fn from(value: &'static str) -> Self {
        ZByteIoError::Generic(value)
    }
}
/// The image reader wrapper
///
/// This wraps anything that implements [ZByteReaderTrait] and
/// extends the ability of the core trait methods by providing
/// utilities like endian aware byte functions.
///
/// This prevents each implementation from providing its own
pub struct ZReader<T: ZByteReaderTrait> {
    /// The underlying source of bytes being decoded.
    inner: T,
    /// Scratch buffer used by [`ZReader::peek_at`]; kept on the struct
    /// so repeated peeks can reuse its allocation.
    temp_buffer: Vec<u8>
}
impl<T: ZByteReaderTrait> ZReader<T> {
/// Create a new reader from a source
/// that implements the [ZByteReaderTrait]
pub fn new(source: T) -> ZReader<T> {
    ZReader {
        inner: source,
        // empty until the first `peek_at` needs scratch space
        temp_buffer: vec![]
    }
}
/// Destroy this reader returning
/// the underlying source of the bytes
/// from which we were decoding
///
/// The current read position within the source is preserved.
#[inline(always)]
pub fn consume(self) -> T {
    self.inner
}
/// Skip ahead ignoring `num` bytes
///
/// For more advanced seek methods see [Self::seek] that allows
/// moving around via more advanced ways
///
/// # Arguments
/// - num: The number of bytes to skip.
///
/// # Returns
/// - `Ok(u64)`: The new position from the start of the stream.
/// - `Error` If something went wrong
#[inline(always)]
pub fn skip(&mut self, num: usize) -> Result<u64, ZByteIoError> {
    // NOTE(review): `num as i64` assumes `num <= i64::MAX` — confirm
    // callers never skip further than that (wraps negative otherwise).
    self.inner.z_seek(ZSeekFrom::Current(num as i64))
}
/// Move back from current position to a previous
/// position
///
/// For more advanced seek methods see [Self::seek] that allows
/// moving around via more advanced ways
///
/// # Arguments
/// - `num`: Positions to move before the current cursor
///
/// # Returns
/// - `Ok(u64)`: The new position from the start of the stream.
/// - `Error` If something went wrong
#[inline(always)]
pub fn rewind(&mut self, num: usize) -> Result<u64, ZByteIoError> {
    // NOTE(review): `-(num as i64)` assumes `num <= i64::MAX` — confirm
    // callers never rewind further than that.
    self.inner.z_seek(ZSeekFrom::Current(-(num as i64)))
}
/// Move around a stream of bytes
///
/// This is analogous to the [std::io::Seek] trait with the same ergonomics
/// only implemented to allow use in a `no_std` environment
///
/// # Arguments
/// - `from`: The seek operation type.
///
/// # Returns
/// - `Ok(u64)`: The new position from the start of the stream.
/// - Error if something went wrong.
#[inline(always)]
pub fn seek(&mut self, from: ZSeekFrom) -> Result<u64, ZByteIoError> {
    self.inner.z_seek(from)
}
/// Read a single byte from the underlying stream
///
/// If an error occurs, it will return `0` as default output
/// hence it may be difficult to distinguish a `0` from the underlying source
/// and a `0` from an error.
/// For that there is [Self::read_u8_err]
///
/// # Returns.
/// - The next byte on the stream, or `0` on error.
///
#[inline(always)]
pub fn read_u8(&mut self) -> u8 {
    self.inner.read_byte_no_error()
}
/// Read a single byte returning an error if the read cannot be satisfied
///
/// # Returns
/// - `Ok(u8)`: The next byte
/// - Error if the byte read could not be satisfied
#[inline(always)]
pub fn read_u8_err(&mut self) -> Result<u8, ZByteIoError> {
    // Single-byte stack buffer; read_const_bytes either fills it or errors.
    let mut buf = [0];
    self.inner.read_const_bytes(&mut buf)?;
    Ok(buf[0])
}
/// Look ahead position bytes and return a reference
/// to num_bytes from that position, or an error if the
/// peek would be out of bounds.
///
/// This doesn't increment the position, bytes would have to be discarded
/// at a later point.
///
/// The returned slice borrows the reader's internal scratch buffer and
/// is invalidated by the next call to this method.
#[inline]
pub fn peek_at(&mut self, position: usize, num_bytes: usize) -> Result<&[u8], ZByteIoError> {
    // short circuit for zero
    // important since implementations like File will
    // cause a syscall on skip
    if position != 0 {
        // skip position bytes from start
        self.skip(position)?;
    }
    // resize buffer
    self.temp_buffer.resize(num_bytes, 0);
    // read bytes
    match self.inner.peek_exact_bytes(&mut self.temp_buffer[..]) {
        Ok(_) => {
            // rewind back to where we were
            if position != 0 {
                self.rewind(position)?;
            }
            Ok(&self.temp_buffer)
        }
        // NOTE(review): on error the cursor is NOT rewound, leaving it
        // `position` bytes ahead — confirm callers treat the stream as
        // poisoned after a failed peek.
        Err(e) => Err(e)
    }
}
/// Read a fixed, compile-time-known number of bytes and return them,
/// or an error if the read could not be satisfied.
///
/// The size of the `N` value must be small enough to fit the stack space
/// otherwise this will cause a stack overflow :)
///
/// If you can ignore errors, you can use [Self::read_fixed_bytes_or_zero].
///
/// # Returns
/// - `Ok([u8;N])`: The bytes read from the source
/// - An error if it occurred.
#[inline(always)]
pub fn read_fixed_bytes_or_error<const N: usize>(&mut self) -> Result<[u8; N], ZByteIoError> {
    let mut byte_store = [0u8; N];
    self.inner.read_const_bytes(&mut byte_store)?;
    Ok(byte_store)
}
/// Read fixed bytes into an array, ignoring any error.
///
/// On failure the array keeps its zero initialization;
/// NOTE(review): some reader implementations may leave partially-read bytes
/// in the buffer on failure — confirm against the concrete
/// `read_const_bytes_no_error` implementation in use.
///
/// If you want to handle errors, use [Self::read_fixed_bytes_or_error]
#[inline(always)]
pub fn read_fixed_bytes_or_zero<const N: usize>(&mut self) -> [u8; N] {
let mut byte_store: [u8; N] = [0; N];
self.inner.read_const_bytes_no_error(&mut byte_store);
byte_store
}
/// Move the cursor to a fixed position in the stream.
///
/// This will move the cursor to exactly `position` bytes from the start of the stream.
///
/// # Arguments
/// - `position`: The absolute offset (from the start) to move the cursor to.
///
/// # Returns
/// - `Ok(())` on success, or an error if the underlying seek failed.
#[inline]
pub fn set_position(&mut self, position: usize) -> Result<(), ZByteIoError> {
self.seek(ZSeekFrom::Start(position as u64))?;
Ok(())
}
/// Return true if the underlying buffer can no longer produce bytes.
///
/// This call may be expensive depending on the underlying buffer type, e.g if
/// it's a file, we have to ask the OS whether we have more contents — in other words make a syscall.
///
/// Use it sparingly.
///
/// # Returns
/// - `Ok(bool)`: True if we are at `EOF`, false if we can produce more bytes
/// - An error if the query itself failed
#[inline(always)]
pub fn eof(&mut self) -> Result<bool, ZByteIoError> {
self.inner.is_eof()
}
/// Return the current position of the inner reader, or an error
/// if one occurred while querying it.
///
/// Like [eof](Self::eof), the performance characteristics vary with the underlying reader.
///
/// # Returns
/// - `Ok(u64)`: The current position of the inner reader
/// - An error if querying the position failed
#[inline(always)]
pub fn position(&mut self) -> Result<u64, ZByteIoError> {
self.inner.z_position()
}
/// Read exactly `buf.len()` bytes from the underlying reader, returning
/// an error if that can't be satisfied.
///
/// Similar to [std::io::Read::read_exact].
///
/// # Returns
/// - `Ok(())`: If the read was successful
/// - An error if the read was unsuccessful, including a failure to fill the whole buffer
pub fn read_exact_bytes(&mut self, buf: &mut [u8]) -> Result<(), ZByteIoError> {
self.inner.read_exact_bytes(buf)
}
/// Read some bytes from the inner reader, returning the number of bytes read.
///
/// The implementation may read fewer bytes than needed to fill the buffer.
///
/// Similar to [std::io::Read::read].
///
/// # Returns
/// - `Ok(usize)`: Number of bytes actually read into the buffer
/// - An error if something went wrong
pub fn read_bytes(&mut self, buf: &mut [u8]) -> Result<usize, ZByteIoError> {
self.inner.read_bytes(buf)
}
}
// Endianness selector used by the macro-generated integer readers below.
enum Mode {
// Big endian
BE,
// Little Endian
LE
}
// Generates the family of endian-aware integer readers on `ZReader`:
// - $name:  private helper returning $int_type, or 0 on a failed read
// - $name2: private helper returning Result<$int_type, ZByteIoError>
// - $name3 / $name4: public BE / LE reads that return a Result
// - $name5 / $name6: public BE / LE reads that return 0 on failure
macro_rules! get_single_type {
($name:tt,$name2:tt,$name3:tt,$name4:tt,$name5:tt,$name6:tt,$int_type:tt) => {
impl<T:ZByteReaderTrait> ZReader<T>
{
#[inline(always)]
fn $name(&mut self, mode: Mode) -> $int_type
{
const SIZE_OF_VAL: usize = core::mem::size_of::<$int_type>();
let mut space = [0; SIZE_OF_VAL];
self.inner.read_const_bytes_no_error(&mut space);
match mode {
Mode::BE => $int_type::from_be_bytes(space),
Mode::LE => $int_type::from_le_bytes(space)
}
}
#[inline(always)]
fn $name2(&mut self, mode: Mode) -> Result<$int_type, ZByteIoError>
{
const SIZE_OF_VAL: usize = core::mem::size_of::<$int_type>();
let mut space = [0; SIZE_OF_VAL];
match self.inner.read_const_bytes(&mut space)
{
Ok(_) => match mode {
Mode::BE => Ok($int_type::from_be_bytes(space)),
Mode::LE => Ok($int_type::from_le_bytes(space))
},
Err(e) => Err(e)
}
}
#[doc=concat!("Read ",stringify!($int_type)," as a big endian integer")]
#[doc=concat!("Returning an error if the underlying buffer cannot support a ",stringify!($int_type)," read.")]
#[inline]
pub fn $name3(&mut self) -> Result<$int_type, ZByteIoError>
{
self.$name2(Mode::BE)
}
#[doc=concat!("Read ",stringify!($int_type)," as a little endian integer")]
#[doc=concat!("Returning an error if the underlying buffer cannot support a ",stringify!($int_type)," read.")]
#[inline]
pub fn $name4(&mut self) -> Result<$int_type, ZByteIoError>
{
self.$name2(Mode::LE)
}
#[doc=concat!("Read ",stringify!($int_type)," as a big endian integer")]
#[doc=concat!("Returning 0 if the underlying buffer does not have enough bytes for a ",stringify!($int_type)," read.")]
#[inline(always)]
pub fn $name5(&mut self) -> $int_type
{
self.$name(Mode::BE)
}
#[doc=concat!("Read ",stringify!($int_type)," as a little endian integer")]
#[doc=concat!("Returning 0 if the underlying buffer does not have enough bytes for a ",stringify!($int_type)," read.")]
#[inline(always)]
pub fn $name6(&mut self) -> $int_type
{
self.$name(Mode::LE)
}
}
};
}
// u16 readers: get_u16_be / get_u16_le (zero on failure),
// get_u16_be_err / get_u16_le_err (Result-returning).
get_single_type!(
get_u16_inner_or_default,
get_u16_inner_or_die,
get_u16_be_err,
get_u16_le_err,
get_u16_be,
get_u16_le,
u16
);
// u32 readers, same naming scheme as the u16 family above.
get_single_type!(
get_u32_inner_or_default,
get_u32_inner_or_die,
get_u32_be_err,
get_u32_le_err,
get_u32_be,
get_u32_le,
u32
);
// u64 readers, same naming scheme as the u16 family above.
get_single_type!(
get_u64_inner_or_default,
get_u64_inner_or_die,
get_u64_be_err,
get_u64_le_err,
get_u64_be,
get_u64_le,
u64
);
#[cfg(feature = "std")]
impl<T> std::io::Read for ZReader<T>
where
    T: ZByteReaderTrait
{
    /// Adapt [`ZReader::read_bytes`] to the standard [`std::io::Read`] trait,
    /// converting internal errors into opaque `std::io::Error`s.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        match self.read_bytes(buf) {
            Ok(bytes_read) => Ok(bytes_read),
            Err(e) => Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                format!("{:?}", e)
            ))
        }
    }
}

198
third_party/zune-core/src/bytestream/reader/no_std_readers.rs vendored

@ -0,0 +1,198 @@
use crate::bytestream::reader::{ZByteIoError, ZSeekFrom};
use crate::bytestream::ZByteReaderTrait;
/// Wraps an in memory buffer providing it with a `Seek` method
/// but works in `no_std` environments
///
/// `std::io::Cursor` is available in std environments, but we also need support
/// for `no_std` environments so this serves as a drop in replacement
pub struct ZCursor<T: AsRef<[u8]>> {
// The wrapped byte container
stream: T,
// Current read offset into `stream`, in bytes from the start
position: usize
}
impl<T: AsRef<[u8]>> ZCursor<T> {
pub fn new(buffer: T) -> ZCursor<T> {
ZCursor {
stream: buffer,
position: 0
}
}
}
impl<T: AsRef<[u8]>> ZCursor<T> {
    /// Move forward `num` bytes from
    /// the current position.
    ///
    /// The new position may point past the internal buffer; all subsequent
    /// reads will then either return an error or zero depending on the
    /// method called.
    #[inline]
    pub fn skip(&mut self, num: usize) {
        // BUGFIX: use saturating_add instead of wrapping_add (the original
        // carried a "Can this overflow ??" comment). A wrapped position
        // would silently land back inside valid data and read the wrong
        // bytes; saturating pins the cursor at usize::MAX (past the end),
        // mirroring how `rewind` saturates at zero.
        self.position = self.position.saturating_add(num);
    }
    /// Move back `num` bytes from the current position
    ///
    /// This saturates at zero, it can never be negative or wraparound
    /// when the value becomes too small
    #[inline]
    pub fn rewind(&mut self, num: usize) {
        self.position = self.position.saturating_sub(num);
    }
}
impl<T: AsRef<[u8]>> ZByteReaderTrait for ZCursor<T> {
// Return the byte at the current position, or 0 when out of bounds.
// NOTE(review): the position is incremented even when the read falls off
// the end, so the cursor keeps advancing on repeated calls.
#[inline(always)]
fn read_byte_no_error(&mut self) -> u8 {
let byte = self.stream.as_ref().get(self.position).unwrap_or(&0);
self.position += 1;
*byte
}
// Fill `buf` completely, or error out; the cursor is restored to where it
// started when the buffer cannot be fully satisfied.
#[inline(always)]
fn read_exact_bytes(&mut self, buf: &mut [u8]) -> Result<(), ZByteIoError> {
let bytes_read = self.read_bytes(buf)?;
if bytes_read != buf.len() {
// restore read to initial position it was in.
self.rewind(bytes_read);
// not all bytes were read.
return Err(ZByteIoError::NotEnoughBytes(bytes_read, buf.len()));
}
Ok(())
}
// Fixed-size exact read; only advances the position when all N bytes
// are available, otherwise leaves the cursor untouched and errors.
fn read_const_bytes<const N: usize>(&mut self, buf: &mut [u8; N]) -> Result<(), ZByteIoError> {
if self.position + N <= self.stream.as_ref().len() {
// we are in bounds
let reference = self.stream.as_ref();
let position = self.position;
if let Some(buf_ref) = reference.get(position..position + N) {
self.position += N;
buf.copy_from_slice(buf_ref);
return Ok(());
}
}
Err(ZByteIoError::Generic("Cannot satisfy read"))
}
// Same as read_const_bytes but silently leaves `buf` untouched when the
// read cannot be satisfied.
fn read_const_bytes_no_error<const N: usize>(&mut self, buf: &mut [u8; N]) {
if self.position + N <= self.stream.as_ref().len() {
// we are in bounds
let reference = self.stream.as_ref();
let position = self.position;
if let Some(buf_ref) = reference.get(position..position + N) {
self.position += N;
buf.copy_from_slice(buf_ref);
}
}
}
// Copy as many bytes as are available (possibly fewer than buf.len())
// and advance the cursor by that amount.
#[inline(always)]
fn read_bytes(&mut self, buf: &mut [u8]) -> Result<usize, ZByteIoError> {
let len = self.peek_bytes(buf)?;
self.skip(len);
Ok(len)
}
// Copy available bytes into `buf` without moving the cursor; the start
// and end are clamped to the stream length so the slice is always valid.
#[inline(always)]
fn peek_bytes(&mut self, buf: &mut [u8]) -> Result<usize, ZByteIoError> {
let stream_end = self.stream.as_ref().len();
let start = core::cmp::min(self.position, stream_end);
let end = core::cmp::min(self.position + buf.len(), stream_end);
let slice = self.stream.as_ref().get(start..end).unwrap();
buf[..slice.len()].copy_from_slice(slice);
let len = slice.len();
Ok(len)
}
// Exact peek: read exactly buf.len() bytes, then rewind by the same
// amount so the cursor ends up where it started.
#[inline(always)]
fn peek_exact_bytes(&mut self, buf: &mut [u8]) -> Result<(), ZByteIoError> {
self.read_exact_bytes(buf)?;
self.rewind(buf.len());
Ok(())
}
// no_std seek; Start assigns directly, End/Current are resolved with a
// checked signed addition so a negative result becomes an error.
#[inline(always)]
fn z_seek(&mut self, from: ZSeekFrom) -> Result<u64, ZByteIoError> {
let (base_pos, offset) = match from {
ZSeekFrom::Start(n) => {
self.position = n as usize;
return Ok(n);
}
ZSeekFrom::End(n) => (self.stream.as_ref().len(), n as isize),
ZSeekFrom::Current(n) => (self.position, n as isize)
};
match base_pos.checked_add_signed(offset) {
Some(n) => {
self.position = n;
Ok(self.position as u64)
}
None => Err(ZByteIoError::SeekError("Negative seek"))
}
}
// EOF when the cursor is at or past the end of the wrapped buffer.
#[inline(always)]
fn is_eof(&mut self) -> Result<bool, ZByteIoError> {
Ok(self.position >= self.stream.as_ref().len())
}
// The cursor offset is the position; infallible for in-memory buffers.
#[inline(always)]
fn z_position(&mut self) -> Result<u64, ZByteIoError> {
Ok(self.position as u64)
}
// Append everything from the cursor to the end of the buffer into `sink`
// and advance the cursor to the end.
fn read_remaining(&mut self, sink: &mut alloc::vec::Vec<u8>) -> Result<usize, ZByteIoError> {
let start = self.position;
let end = self.stream.as_ref().len();
match self.stream.as_ref().get(start..end) {
None => {
return Err(ZByteIoError::Generic(
"Somehow read remaining couldn't satisfy it's invariants"
))
}
Some(e) => {
sink.extend_from_slice(e);
}
}
self.skip(end - start);
Ok(end - start)
}
}
#[cfg(feature = "std")]
impl<T: AsRef<[u8]>> std::io::Seek for ZCursor<T> {
    /// Mirror of [`ZCursor`]'s internal `z_seek`, exposed through the
    /// standard [`std::io::Seek`] trait for std consumers.
    fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
        use std::io::SeekFrom;
        let (base, delta) = match pos {
            SeekFrom::Start(n) => {
                self.position = n as usize;
                return Ok(n);
            }
            SeekFrom::End(n) => (self.stream.as_ref().len(), n as isize),
            SeekFrom::Current(n) => (self.position, n as isize)
        };
        if let Some(new_pos) = base.checked_add_signed(delta) {
            self.position = new_pos;
            Ok(new_pos as u64)
        } else {
            Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                "Negative seek"
            ))
        }
    }
}
//
// #[cfg(feature = "std")]
// impl<T: AsRef<[u8]>> std::io::Read for ZCursor<T> {
// fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
// self.read_bytes(buf).map_err(|x|{ std::io::Error::new()})
// }
// }
impl<T: AsRef<[u8]>> From<T> for ZCursor<T> {
fn from(value: T) -> Self {
ZCursor::new(value)
}
}

100
third_party/zune-core/src/bytestream/reader/std_readers.rs vendored

@ -0,0 +1,100 @@
#![cfg(feature = "std")]
use std::io;
use std::io::SeekFrom;
use crate::bytestream::reader::{ZByteIoError, ZSeekFrom};
use crate::bytestream::ZByteReaderTrait;
// note (cae): If Rust ever stabilizes trait specialization, specialize this for Cursor
// Blanket implementation for any std reader that can seek; `BufRead` is
// required so `is_eof` can probe the buffer without consuming bytes.
impl<T: io::BufRead + io::Seek> ZByteReaderTrait for T {
// Single-byte read; any I/O error is swallowed and the zero-initialized
// buffer value is returned instead.
#[inline(always)]
fn read_byte_no_error(&mut self) -> u8 {
let mut buf = [0];
let _ = self.read(&mut buf);
buf[0]
}
// Loop until `buf` is full; on a premature EOF, seek back to where we
// started so the stream position matches the ZCursor behavior.
#[inline(always)]
fn read_exact_bytes(&mut self, buf: &mut [u8]) -> Result<(), ZByteIoError> {
let mut bytes_read = 0;
while bytes_read < buf.len() {
match self.read(&mut buf[bytes_read..]) {
Ok(0) => {
// if a read returns zero bytes read, it means it encountered an EOF so we seek
// back to where we started because some paths may aggressively read forward and
// ZCursor maintains the position.
// NB: (cae) [tag=perf] This adds a branch on every read, and will slow down every function
// resting on it. Sorry
self.seek(SeekFrom::Current(-(bytes_read as i64)))
.map_err(ZByteIoError::from)?;
return Err(ZByteIoError::NotEnoughBytes(bytes_read, buf.len()));
}
Ok(bytes) => {
bytes_read += bytes;
}
Err(e) => return Err(ZByteIoError::from(e))
}
}
Ok(())
}
// Fixed-size reads share the generic exact-read path for std readers.
#[inline]
fn read_const_bytes<const N: usize>(&mut self, buf: &mut [u8; N]) -> Result<(), ZByteIoError> {
self.read_exact_bytes(buf)
}
// Best-effort fixed-size read; NOTE(review): on failure `buf` may hold
// partially-read bytes even though the position was seeked back.
fn read_const_bytes_no_error<const N: usize>(&mut self, buf: &mut [u8; N]) {
let _ = self.read_const_bytes(buf);
}
// Plain read, error type converted to the crate's error enum.
#[inline(always)]
fn read_bytes(&mut self, buf: &mut [u8]) -> Result<usize, ZByteIoError> {
self.read(buf).map_err(ZByteIoError::from)
}
// Peek is emulated as read-then-seek-back, since std readers have no
// native non-consuming multi-byte peek.
#[inline(always)]
fn peek_bytes(&mut self, buf: &mut [u8]) -> Result<usize, ZByteIoError> {
// first read bytes to the buffer
let bytes_read = self.read_bytes(buf)?;
let converted = -i64::try_from(bytes_read).map_err(ZByteIoError::from)?;
self.seek(std::io::SeekFrom::Current(converted))
.map_err(ZByteIoError::from)?;
Ok(bytes_read)
}
// Exact peek: exact read followed by a seek back over `buf.len()` bytes.
#[inline(always)]
fn peek_exact_bytes(&mut self, buf: &mut [u8]) -> Result<(), ZByteIoError> {
// first read bytes to the buffer
self.read_exact_bytes(buf)?;
let converted = -i64::try_from(buf.len()).map_err(ZByteIoError::from)?;
self.seek(std::io::SeekFrom::Current(converted))
.map_err(ZByteIoError::from)?;
Ok(())
}
// Translate the crate's seek enum into std's and delegate.
#[inline(always)]
fn z_seek(&mut self, from: ZSeekFrom) -> Result<u64, ZByteIoError> {
self.seek(from.to_std_seek()).map_err(ZByteIoError::from)
}
// EOF when filling the BufRead buffer yields nothing; this may issue a
// syscall on readers like File.
#[inline(always)]
fn is_eof(&mut self) -> Result<bool, ZByteIoError> {
self.fill_buf()
.map(|b| b.is_empty())
.map_err(ZByteIoError::from)
}
// Delegate to std's stream_position.
#[inline(always)]
fn z_position(&mut self) -> Result<u64, ZByteIoError> {
self.stream_position().map_err(ZByteIoError::from)
}
// Delegate to std's read_to_end, converting the error type.
#[inline(always)]
fn read_remaining(&mut self, sink: &mut Vec<u8>) -> Result<usize, ZByteIoError> {
self.read_to_end(sink).map_err(ZByteIoError::from)
}
}

146
third_party/zune-core/src/bytestream/traits.rs vendored

@ -0,0 +1,146 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! Traits for reading and writing images in zune
//!
//!
//! This exposes the traits and implementations for readers
//! and writers in the zune family of decoders and encoders.
use crate::bytestream::reader::{ZByteIoError, ZSeekFrom};
/// The de-facto Input trait implemented for readers.
///
/// This provides the basic functions needed for quick and sometimes
/// heap-free I/O for the zune image decoders, with easy support for extending it
/// to multiple implementations.
///
/// # Considerations
///
/// If you have an in memory buffer, prefer [`ZCursor`](crate::bytestream::ZCursor) over [`Cursor`](std::io::Cursor).
/// We implement this trait for two types, `ZCursor`, and any thing that implements `BufRead`+`Seek`, `Cursor` falls in the latter
/// and since Rust doesn't have specialization for traits, we can only implement it once. This means functions like
/// [`read_byte_no_error`](crate::bytestream::ZByteReaderTrait::read_byte_no_error) are slower than they should be for `Cursor`.
///
pub trait ZByteReaderTrait {
/// Read a single byte from the decoder and return
/// `0` if we can't read the byte, e.g because of EOF
///
/// The implementation should try to be as fast as possible as this is called
/// from some hot loops where it may become the bottleneck
fn read_byte_no_error(&mut self) -> u8;
/// Read exact bytes required to fill `buf` or return an error if that isn't possible
///
/// ## Arguments
/// - `buf`: Buffer to fill with bytes from the underlying reader
/// ## Errors
/// In case of an error, the implementation should not increment the internal position
fn read_exact_bytes(&mut self, buf: &mut [u8]) -> Result<(), ZByteIoError>;
/// Read exact bytes required to fill `buf` or return an error if that isn't possible
///
/// This is the same as [`read_exact_bytes`](Self::read_exact_bytes) but implemented as a separate
/// method to allow some implementations to optimize it to cost fewer instructions
///
/// ## Arguments
/// - `buf`: Buffer to fill with bytes from the underlying reader
/// ## Errors
/// In case of an error, the implementation should not increment the internal position
fn read_const_bytes<const N: usize>(&mut self, buf: &mut [u8; N]) -> Result<(), ZByteIoError>;
/// Read exact bytes required to fill `buf` or ignore buf entirely if you can't fill it
/// due to an error like the inability to fill the buffer completely
/// ## Arguments
/// - `buf`: Buffer to fill with bytes from the underlying reader
/// ## Errors
/// In case of an error, the implementation should not increment the internal position
fn read_const_bytes_no_error<const N: usize>(&mut self, buf: &mut [u8; N]);
/// Read bytes into `buf` returning how many bytes you have read or an error if one occurred
///
/// This doesn't guarantee that buf will be filled with bytes, for such a guarantee see
/// [`read_exact_bytes`](Self::read_exact_bytes)
///
/// ## Arguments
/// - `buf`: The buffer to fill with bytes
///
/// ## Returns
/// - `Ok(usize)` - Actual bytes read into the buffer
/// - `Err()` - The error encountered when reading bytes for which we couldn't recover
fn read_bytes(&mut self, buf: &mut [u8]) -> Result<usize, ZByteIoError>;
/// Read data into the provided buffer without advancing the read position.
///
/// ## Returns
/// - `Ok(usize)` - Actual bytes copied into the buffer, which may be fewer
/// than `buf.len()` when the stream is near its end
/// - `Err()` - The error encountered while peeking
fn peek_bytes(&mut self, buf: &mut [u8]) -> Result<usize, ZByteIoError>;
/// Fill `buf` exactly without advancing the read position, or return an
/// error when that many bytes are not available.
fn peek_exact_bytes(&mut self, buf: &mut [u8]) -> Result<(), ZByteIoError>;
/// Seek into a new position from the buffer
///
/// This is similar to the [seek](std::io::Seek::seek) function in the [Seek](std::io::Seek) trait
/// but implemented to work for no-std environments
fn z_seek(&mut self, from: ZSeekFrom) -> Result<u64, ZByteIoError>;
/// Report whether we are at the end of a stream.
///
/// ## Warning
/// This may cause an additional syscall e.g when we are reading from a file, we must query the file
/// multiple times to check if we really are at the end of the file and the user didn't sneakily
/// add more contents to it hence use it with care
///
/// ## Returns
/// - `Ok(bool)` - The answer to whether or not we are at end of file
/// - `Err()` - The error that occurred when we queried the underlying reader if we were at EOF
fn is_eof(&mut self) -> Result<bool, ZByteIoError>;
/// Return the current position of the inner cursor.
///
/// This can be used to check the advancement of the cursor
fn z_position(&mut self) -> Result<u64, ZByteIoError>;
/// Read all bytes remaining in this input to `sink` until we hit eof
///
/// # Returns
/// - `Ok(usize)` The actual number of bytes added to the sink
/// - `Err()` An error that occurred when reading bytes
fn read_remaining(&mut self, sink: &mut alloc::vec::Vec<u8>) -> Result<usize, ZByteIoError>;
}
/// The writer trait implemented for the zune-image library of encoders
///
/// Anything that implements this trait can be used as a sink
/// for writing encoded images
pub trait ZByteWriterTrait {
/// Write some bytes into the sink returning the number of bytes written, or
/// an error if something bad happened
///
/// An implementation is free to write fewer bytes than are in `buf`, so a full
/// write cannot be guaranteed by this method
fn write_bytes(&mut self, buf: &[u8]) -> Result<usize, ZByteIoError>;
/// Write all bytes to the buffer or return an error if something occurred
///
/// This will always write all bytes, if it can't fully write all bytes, it will
/// error out
fn write_all_bytes(&mut self, buf: &[u8]) -> Result<(), ZByteIoError>;
/// Write a fixed number of bytes and error out if we can't write the bytes
///
/// This is provided to allow for optimized writes where possible. (when the compiler can const fold them)
fn write_const_bytes<const N: usize>(&mut self, buf: &[u8; N]) -> Result<(), ZByteIoError>;
/// Ensure bytes are written to the sink.
///
/// Implementations should treat this like linux `fsync`, and should implement
/// whatever the writer's equivalent of fsync should look like
///
/// After this, the encoder should be able to guarantee that all in-core data is synced with the
/// storage device
fn flush_bytes(&mut self) -> Result<(), ZByteIoError>;
/// A hint to tell the implementation how big of a size we expect the image to be
/// An implementation like in memory `Vec` can use this to reserve additional memory to
/// prevent reallocation when encoding
///
/// This is just a hint, akin to calling `Vec::reserve` and should be treated as such.
/// If your implementation doesn't support such, e.g file or mutable slices, it's okay to return
/// `Ok(())`
fn reserve_capacity(&mut self, size: usize) -> Result<(), ZByteIoError>;
}

262
third_party/zune-core/src/bytestream/writer.rs vendored

@ -0,0 +1,262 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
use crate::bytestream::{ZByteIoError, ZByteWriterTrait};
mod no_std_writer;
mod std_writer;
// Endianness selector used by the macro-generated integer writers below.
enum Mode {
// Big endian
BE,
// Little Endian
LE
}
/// Encapsulates a simple Byte writer with
/// support for Endian aware writes
pub struct ZWriter<T: ZByteWriterTrait> {
// The underlying sink that receives the bytes
buffer: T,
// Running total of bytes successfully written through this writer
bytes_written: usize
}
impl<T: ZByteWriterTrait> ZWriter<T> {
/// Write bytes from the buf into the bytestream
/// and return how many bytes were written
///
/// # Arguments
/// - `buf`: The bytes to be written to the bytestream
///
/// # Returns
/// - `Ok(usize)` - Number of bytes written
/// This number may be less than `buf.len()` if the length of the buffer is greater
/// than the internal bytestream length
///
/// If you want to be sure that all bytes were written, see [`write_all`](Self::write_all)
///
#[inline]
pub fn write(&mut self, buf: &[u8]) -> Result<usize, ZByteIoError> {
let bytes_written = self.buffer.write_bytes(buf)?;
self.bytes_written += bytes_written;
Ok(bytes_written)
}
/// Write all bytes from `buf` into the bytestream, returning an error
/// if not all bytes could be written to the bytestream
///
/// # Arguments
/// - `buf`: The bytes to be written into the bytestream
///
///# Returns
/// - `Ok(())`: Indicates all bytes were written into the bytestream
/// - `Err(ZByteIoError)`: In case all the bytes could not be written
/// to the stream
pub fn write_all(&mut self, buf: &[u8]) -> Result<(), ZByteIoError> {
self.buffer.write_all_bytes(buf)?;
self.bytes_written += buf.len();
Ok(())
}
/// Create a new bytestream writer
/// Bytes are written from the start to the end and no assumptions
/// are made about the nature of the underlying stream
///
/// # Arguments
pub fn new(data: T) -> ZWriter<T> {
ZWriter {
buffer: data,
bytes_written: 0
}
}
/// Write a single byte into the bytestream or error out
/// if there is not enough space
///
/// # Example
/// ```
/// use zune_core::bytestream::ZWriter;
/// let mut buf = [0;10];
/// let mut stream = ZWriter::new(&mut buf[..]);
/// assert!(stream.write_u8_err(34).is_ok());
/// ```
/// No space
/// ```
/// use zune_core::bytestream::ZWriter;
/// let mut no_space = [];
/// let mut stream = ZWriter::new(&mut no_space[..]);
/// assert!(stream.write_u8_err(32).is_err());
/// ```
///
#[inline]
pub fn write_u8_err(&mut self, byte: u8) -> Result<(), ZByteIoError> {
self.write_const_bytes(&[byte])
}
/// Write a fixed compile time known number of bytes to the sink
///
/// This is provided since some implementations can optimize such writes by eliminating
/// some redundant code.
#[inline]
pub fn write_const_bytes<const N: usize>(
&mut self, byte: &[u8; N]
) -> Result<(), ZByteIoError> {
self.buffer.write_const_bytes(byte)?;
self.bytes_written += N;
Ok(())
}
/// Write a single byte in the stream or don't write
/// anything if the buffer is full and cannot support the byte write
///
#[inline]
pub fn write_u8(&mut self, byte: u8) {
let _ = self.write_const_bytes(&[byte]);
}
/// Return the number of bytes written by this encoder
///
/// The encoder keeps information of how many bytes were written and this method
/// returns that value.
///
/// # Returns
/// Number of bytes written
pub fn bytes_written(&self) -> usize {
self.bytes_written
}
/// Reserve some additional space to write.
///
/// Some sinks like `Vec<u8>` allow reallocation and to prevent too much reallocation
/// one can use this to reserve additional space to encode
///
/// # Example
///
/// ```
/// use zune_core::bytestream::ZWriter;
/// let space_needed = 10; // Assume the image will fit into 10 bytes
/// let mut output = Vec::new();
/// let mut sink = ZWriter::new(&mut output);
/// // now reserve some space
///sink.reserve(space_needed).unwrap();
/// // at this point, we can assume that ZWriter allocated space for output
/// ```
pub fn reserve(&mut self, additional: usize) -> Result<(), ZByteIoError> {
self.buffer.reserve_capacity(additional)
}
/// Consume the writer and return the inner sink
/// we were writing to.
///
/// After this, the writer can no longer be used
pub fn inner(self) -> T {
self.buffer
}
/// Return an immutable reference to the inner sink
pub fn inner_ref(&self) -> &T {
&self.buffer
}
/// Return a mutable reference to the inner sink
pub fn inner_mut(&mut self) -> &mut T {
&mut self.buffer
}
}
// Generates the family of endian-aware integer writers on `ZWriter`:
// - $name:  private helper that writes $int_type, returning a Result
// - $name2: private helper that writes $int_type, ignoring failure
// - $name3 / $name4: public BE / LE writes that return a Result
// - $name5 / $name6: public BE / LE writes that silently ignore failure
macro_rules! write_single_type {
($name:tt,$name2:tt,$name3:tt,$name4:tt,$name5:tt,$name6:tt,$int_type:tt) => {
impl<T:ZByteWriterTrait> ZWriter<T>
{
#[inline(always)]
fn $name(&mut self, byte: $int_type, mode: Mode) -> Result<(), ZByteIoError>
{
// get bits, depending on mode.
// This should be inlined and not visible in
// the generated binary since mode is a compile
// time constant.
let bytes = match mode
{
Mode::BE => byte.to_be_bytes(),
Mode::LE => byte.to_le_bytes()
};
self.write_const_bytes(&bytes)
}
#[inline(always)]
fn $name2(&mut self, byte: $int_type, mode: Mode)
{
// get bits, depending on mode.
// This should be inlined and not visible in
// the generated binary since mode is a compile
// time constant.
let bytes = match mode
{
Mode::BE => byte.to_be_bytes(),
Mode::LE => byte.to_le_bytes()
};
let _ = self.write_const_bytes(&bytes);
}
#[doc=concat!("Write ",stringify!($int_type)," as a big endian integer")]
#[doc=concat!("Returning an error if the underlying buffer cannot support a ",stringify!($int_type)," write.")]
#[inline]
pub fn $name3(&mut self, byte: $int_type) -> Result<(), ZByteIoError>
{
self.$name(byte, Mode::BE)
}
#[doc=concat!("Write ",stringify!($int_type)," as a little endian integer")]
#[doc=concat!("Returning an error if the underlying buffer cannot support a ",stringify!($int_type)," write.")]
#[inline]
pub fn $name4(&mut self, byte: $int_type) -> Result<(), ZByteIoError>
{
self.$name(byte, Mode::LE)
}
#[doc=concat!("Write ",stringify!($int_type)," as a big endian integer")]
#[doc=concat!("Or don't write anything if the reader cannot support a ",stringify!($int_type)," write.")]
#[inline]
pub fn $name5(&mut self, byte: $int_type)
{
self.$name2(byte, Mode::BE)
}
#[doc=concat!("Write ",stringify!($int_type)," as a little endian integer")]
#[doc=concat!("Or don't write anything if the reader cannot support a ",stringify!($int_type)," write.")]
#[inline]
pub fn $name6(&mut self, byte: $int_type)
{
self.$name2(byte, Mode::LE)
}
}
};
}
// u64 writers: write_u64_be/le (silent) and write_u64_be_err/le_err (Result)
write_single_type!(
write_u64_inner_or_die,
write_u64_inner_or_none,
write_u64_be_err,
write_u64_le_err,
write_u64_be,
write_u64_le,
u64
);
// u32 writers, same naming scheme as the u64 family above.
write_single_type!(
write_u32_inner_or_die,
write_u32_inner_or_none,
write_u32_be_err,
write_u32_le_err,
write_u32_be,
write_u32_le,
u32
);
// u16 writers, same naming scheme as the u64 family above.
write_single_type!(
write_u16_inner_or_die,
write_u16_inner_or_none,
write_u16_be_err,
write_u16_le_err,
write_u16_be,
write_u16_le,
u16
);

70
third_party/zune-core/src/bytestream/writer/no_std_writer.rs vendored

@ -0,0 +1,70 @@
// We cannot provide both the impls below and the std ones, because we'd end up re-implementing the
// same trait for &[u8], which is already covered by the blanket `Write` impl — yielding two conflicting implementations.
#![cfg(not(feature = "std"))]
use crate::bytestream::{ZByteIoError, ZByteWriterTrait};
impl ZByteWriterTrait for &mut [u8] {
    /// Write as many bytes as fit into the remaining slice space,
    /// shrinking the slice to the unwritten tail.
    fn write_bytes(&mut self, buf: &[u8]) -> Result<usize, ZByteIoError> {
        // mirrors std's `Write for &mut [u8]` implementation
        let amt = core::cmp::min(buf.len(), self.len());
        let (head, tail) = core::mem::take(self).split_at_mut(amt);
        head.copy_from_slice(&buf[..amt]);
        *self = tail;
        Ok(amt)
    }
    /// Write all of `buf` or fail without writing anything when the
    /// remaining slice is too small.
    fn write_all_bytes(&mut self, buf: &[u8]) -> Result<(), ZByteIoError> {
        if buf.len() > self.len() {
            return Err(ZByteIoError::NotEnoughBuffer(self.len(), buf.len()));
        }
        // The length check above guarantees the whole buffer fits, so the
        // original's extra `min` computation was redundant and is dropped.
        let (head, tail) = core::mem::take(self).split_at_mut(buf.len());
        head.copy_from_slice(buf);
        *self = tail;
        Ok(())
    }
    /// Fixed-size variant of [`Self::write_all_bytes`]; `N` is known at
    /// compile time so the copy can often be const-folded.
    fn write_const_bytes<const N: usize>(&mut self, buf: &[u8; N]) -> Result<(), ZByteIoError> {
        if N > self.len() {
            return Err(ZByteIoError::NotEnoughBuffer(self.len(), N));
        }
        // As above: the check guarantees N bytes fit.
        let (head, tail) = core::mem::take(self).split_at_mut(N);
        head.copy_from_slice(buf);
        *self = tail;
        Ok(())
    }
    /// Writes land directly in the slice; there is nothing to flush.
    fn flush_bytes(&mut self) -> Result<(), ZByteIoError> {
        Ok(())
    }
    /// A fixed slice cannot grow, so the capacity hint is a no-op.
    fn reserve_capacity(&mut self, _: usize) -> Result<(), ZByteIoError> {
        Ok(())
    }
}
impl ZByteWriterTrait for &mut alloc::vec::Vec<u8> {
    /// Append `buf` to the vector; a growable vector always takes all bytes.
    fn write_bytes(&mut self, buf: &[u8]) -> Result<usize, ZByteIoError> {
        self.extend_from_slice(buf);
        Ok(buf.len())
    }
    /// Append `buf`; cannot fail short of allocation failure (which aborts).
    fn write_all_bytes(&mut self, buf: &[u8]) -> Result<(), ZByteIoError> {
        self.write_bytes(buf).map(|_| ())
    }
    /// Fixed-size variant; shares the append path.
    fn write_const_bytes<const N: usize>(&mut self, buf: &[u8; N]) -> Result<(), ZByteIoError> {
        self.write_all_bytes(buf)
    }
    /// In-memory sink: nothing is buffered, so nothing to flush.
    fn flush_bytes(&mut self) -> Result<(), ZByteIoError> {
        Ok(())
    }
    /// Pre-allocate `size` extra bytes to reduce reallocations while encoding.
    fn reserve_capacity(&mut self, size: usize) -> Result<(), ZByteIoError> {
        self.reserve(size);
        Ok(())
    }
}

27
third_party/zune-core/src/bytestream/writer/std_writer.rs vendored

@ -0,0 +1,27 @@
#![cfg(feature = "std")]
use std::io::Write;
use crate::bytestream::ZByteIoError;
// Blanket implementation: any `std::io::Write` sink can receive encoded bytes.
impl<T: Write> crate::bytestream::ZByteWriterTrait for T {
    /// Forward to [`Write::write`], wrapping any I/O failure.
    fn write_bytes(&mut self, buf: &[u8]) -> Result<usize, ZByteIoError> {
        match self.write(buf) {
            Ok(written) => Ok(written),
            Err(e) => Err(ZByteIoError::StdIoError(e))
        }
    }
    /// Forward to [`Write::write_all`], wrapping any I/O failure.
    fn write_all_bytes(&mut self, buf: &[u8]) -> Result<(), ZByteIoError> {
        match self.write_all(buf) {
            Ok(()) => Ok(()),
            Err(e) => Err(ZByteIoError::StdIoError(e))
        }
    }
    /// Fixed-size writes reuse the `write_all_bytes` path.
    fn write_const_bytes<const N: usize>(&mut self, buf: &[u8; N]) -> Result<(), ZByteIoError> {
        self.write_all_bytes(buf)
    }
    /// Forward to [`Write::flush`], wrapping any I/O failure.
    fn flush_bytes(&mut self) -> Result<(), ZByteIoError> {
        match self.flush() {
            Ok(()) => Ok(()),
            Err(e) => Err(ZByteIoError::StdIoError(e))
        }
    }
    /// Generic writers cannot pre-allocate, so the hint is a no-op.
    fn reserve_capacity(&mut self, _: usize) -> Result<(), ZByteIoError> {
        Ok(())
    }
}

161
third_party/zune-core/src/colorspace.rs vendored

@ -0,0 +1,161 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! Image Colorspace information and manipulation utilities.
/// All possible image colorspaces
/// Some of them aren't yet supported exist here.
#[allow(clippy::upper_case_acronyms)]
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum ColorSpace {
/// Red, Green , Blue
RGB,
/// Red, Green, Blue, Alpha
RGBA,
/// YUV colorspace
YCbCr,
/// Grayscale colorspace
Luma,
/// Grayscale with alpha colorspace
LumaA,
/// Y, Cb, Cr plus a black (K) channel — a four-component variant of YCbCr
YCCK,
/// Cyan , Magenta, Yellow, Black
CMYK,
/// Blue, Green, Red
BGR,
/// Blue, Green, Red, Alpha
BGRA,
/// The colorspace is unknown
Unknown,
/// Alpha Red Green Blue
ARGB,
/// Hue,Saturation,Lightness
/// Conversion from RGB to HSL and back matches that of Python [colorsys](https://docs.python.org/3/library/colorsys.html) module
/// Color type is expected to be in floating point
HSL,
/// Hue, Saturation,Value
///
/// Conversion from RGB to HSV and back matches that of Python [colorsys](https://docs.python.org/3/library/colorsys.html) module
/// Color type is expected to be in floating point
HSV
}
impl ColorSpace {
/// Number of color channels present for a certain colorspace
///
/// E.g. RGB returns 3 since it contains R,G and B colors to make up a pixel
pub const fn num_components(&self) -> usize {
match self {
Self::RGB | Self::YCbCr | Self::BGR | Self::HSV | Self::HSL => 3,
Self::RGBA | Self::YCCK | Self::CMYK | Self::BGRA | Self::ARGB => 4,
Self::Luma => 1,
Self::LumaA => 2,
Self::Unknown => 0
}
}
pub const fn has_alpha(&self) -> bool {
matches!(self, Self::RGBA | Self::LumaA | Self::BGRA | Self::ARGB)
}
pub const fn is_grayscale(&self) -> bool {
matches!(self, Self::LumaA | Self::Luma)
}
/// Returns the position of the alpha pixel in a pixel
///
///
/// That is for an array of color components say `[0,1,2,3]` if the image has an alpha channel
/// and is in RGBA format, this will return `Some(3)`, indicating alpha is found in the third index
/// but if the image is in `ARGB` format, it will return `Some(0)` indicating alpha is found in
/// index 0
///
/// If an image doesn't have an alpha channel returns `None`
///
pub const fn alpha_position(&self) -> Option<usize> {
match self {
ColorSpace::RGBA => Some(3),
ColorSpace::LumaA => Some(1),
ColorSpace::BGRA => Some(3),
ColorSpace::ARGB => Some(0),
_ => None
}
}
}
/// Encapsulates all colorspaces supported by
/// the library
// NOTE(review): keep this list in sync with the `ColorSpace` enum;
// `Unknown` is deliberately not listed here.
pub static ALL_COLORSPACES: [ColorSpace; 12] = [
    ColorSpace::RGB,
    ColorSpace::RGBA,
    ColorSpace::LumaA,
    ColorSpace::Luma,
    ColorSpace::CMYK,
    ColorSpace::BGRA,
    ColorSpace::BGR,
    ColorSpace::YCCK,
    ColorSpace::YCbCr,
    ColorSpace::ARGB,
    ColorSpace::HSL,
    ColorSpace::HSV
];
/// Color characteristics
///
/// Gives more information about values in a certain
/// colorspace
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum ColorCharacteristics {
    /// Normal default gamma setting (the standard sRGB transfer curve).
    ///
    /// The default gamma value is 2.2, but for
    /// decoders that allow specifying gamma values, e.g. PNG,
    /// the gamma value becomes the value specified by the decoder.
    // NOTE(review): the original doc said "The float contains gamma present",
    // but this variant carries no payload — the gamma value must live
    // elsewhere; confirm against the decoders that use this type.
    sRGB,
    /// Linear transfer characteristics.
    /// The image is in linear colorspace.
    Linear
}
/// Represents a single channel color primary.
///
/// This can be viewed as a 3D coordinate of the color primary
/// for a given colorspace
#[derive(Default, Debug, Copy, Clone)]
pub struct SingleColorPrimary {
    /// First coordinate of the primary
    pub x: f64,
    /// Second coordinate of the primary
    pub y: f64,
    /// Third coordinate of the primary
    pub z: f64
}
/// A collection of red, green and blue color primaries placed
/// in one struct for easy manipulation
#[derive(Default, Debug, Copy, Clone)]
pub struct ColorPrimaries {
    /// Red color primaries
    pub red: SingleColorPrimary,
    /// Green color primaries
    pub green: SingleColorPrimary,
    /// Blue color primaries
    pub blue: SingleColorPrimary
}
/// Rendering intents indicate what one may want to do with colors outside of its gamut
///
///
/// Further reading
/// - [IBM Rendering Intent](https://www.ibm.com/docs/en/i/7.5?topic=management-rendering-intents)
/// - [ColorGate Blog](https://blog.colorgate.com/en/rendering-intent-explained)
#[derive(Eq, PartialEq, Clone, Copy, Debug)]
pub enum RenderingIntent {
    /// Reproduce colors exactly, without adapting to the destination
    /// white point (see links above for details).
    AbsoluteColorimetric,
    /// Favor color vividness/saturation over exact reproduction.
    Saturation,
    /// Reproduce colors relative to the destination white point.
    RelativeColorimetric,
    /// Smoothly compress the whole gamut to preserve the overall
    /// visual relationship between colors.
    Perceptual
}

62
third_party/zune-core/src/lib.rs vendored

@ -0,0 +1,62 @@
/*
 * Copyright (c) 2023.
 *
 * This software is free software; You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
 */
//! Core routines shared by all libraries
//!
//! This crate provides a set of core routines shared
//! by the decoders and encoders under `zune` umbrella
//!
//! It currently contains
//!
//! - A bytestream reader and writer with endian aware reads and writes
//! - Colorspace and bit depth information shared by images
//! - Image decoder and encoder options
//! - A simple enum type to hold image decoding results.
//!
//! This library is `#[no_std]` with `alloc` feature needed for defining `Vec`
//! which we need for storing decoded bytes.
//!
//!
//! # Features
//! - `std`: Enabled by default; when *disabled* the crate is compiled as
//!   `#![no_std]` (see the `cfg_attr` below).
//!
//! - `serde`: Enables serializing of some of the data structures
//!   present in the crate
//!
//!
//! # Input/Output
//!
//! zune-image supports many different input and output devices. For input readers
//! we can read anything that implements `BufRead` + `Seek` and provide an optimized routine for
//! handling in memory buffers by using [`ZCursor`](crate::bytestream::ZCursor).
//!
//! For output, we support anything that implements `Write` trait, this includes files, standard io streams
//! network sockets, etc
//!
//! In a `no_std` environment. We can write to in memory buffers `&mut [u8]` and `&mut Vec<u8>`
//!
//! If you have an in memory buffer, use [`ZCursor`](crate::bytestream::ZCursor),
//! it's optimized for in memory buffers.
//!
//!
//!
#![cfg_attr(not(feature = "std"), no_std)]
// NOTE(review): `#![macro_use]` as a crate-level inner attribute is unusual —
// the macros in `log.rs` are already `#[macro_export]`ed; confirm this
// attribute is actually needed.
#![macro_use]
extern crate alloc;
extern crate core;
// When the `log` feature is off, compile a no-op shim module; when it is on,
// re-export the real `log` crate under the same path.
#[cfg(not(feature = "log"))]
pub mod log;
#[cfg(feature = "log")]
pub use log;
pub mod bit_depth;
pub mod bytestream;
pub mod colorspace;
pub mod options;
pub mod result;
mod serde;

74
third_party/zune-core/src/log.rs vendored

@ -0,0 +1,74 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
// #[macro_export] is required to make macros works across crates
// but it always put the macro in the crate root.
// #[doc(hidden)] + "pub use" is a workaround to namespace a macro.
pub use crate::{
__debug as debug, __error as error, __info as info, __log_enabled as log_enabled,
__trace as trace, __warn as warn
};
/// Logging severity levels for the no-op shim; the discriminants start at 1
/// (`Error`, most severe) and increase as severity decreases.
// NOTE(review): this appears to mirror the `log` crate's `Level` type, which
// replaces this module when the `log` feature is enabled — confirm the
// discriminants stay in sync with that crate.
#[repr(usize)]
#[derive(Copy, Clone, Eq, PartialEq, Debug, Hash)]
pub enum Level {
    Error = 1,
    Warn,
    Info,
    Debug,
    Trace
}
// No-op logging shims used when the `log` feature is disabled. Each macro
// consumes its arguments and expands to (almost) nothing so call sites
// compile identically with or without real logging.
#[doc(hidden)]
#[macro_export]
macro_rules! __log_enabled {
    // `let _ = $lvl` keeps the argument "used" so the expression still
    // type-checks; logging is never enabled in the shim.
    ($lvl:expr) => {{
        let _ = $lvl;
        false
    }};
}
#[doc(hidden)]
#[macro_export]
macro_rules! __error {
    // NOTE(review): the eprintln! is commented out, so error messages are
    // silently discarded even when `std` is available — confirm intentional.
    ($($arg:tt)+) => {
        #[cfg(feature = "std")]
        {
            //eprintln!($($arg)+);
        }
    };
}
#[doc(hidden)]
#[macro_export]
macro_rules! __warn {
    // Same silent-drop behavior as `__error` above.
    ($($arg:tt)+) => {
        #[cfg(feature = "std")]
        {
            //eprintln!($($arg)+);
        }
    };
}
#[doc(hidden)]
#[macro_export]
macro_rules! __info {
    ($($arg:tt)+) => {};
}
#[doc(hidden)]
#[macro_export]
macro_rules! __debug {
    ($($arg:tt)+) => {};
}
#[doc(hidden)]
#[macro_export]
macro_rules! __trace {
    ($($arg:tt)+) => {};
}

13
third_party/zune-core/src/options.rs vendored

@ -0,0 +1,13 @@
//! Decoder and Encoder Options
//!
//! This module exposes a struct for which all implemented
//! decoders get shared options for decoding
//!
//! All supported options are collected into a single options struct, allowing for global
//! configuration — e.g. the same `DecoderOptions` can be reused across all decoders
//!
pub use decoder::DecoderOptions;
pub use encoder::EncoderOptions;
mod decoder;
mod encoder;

666
third_party/zune-core/src/options/decoder.rs vendored

@ -0,0 +1,666 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! Global Decoder options
#![allow(clippy::zero_prefixed_literal)]
use crate::bit_depth::ByteEndian;
use crate::colorspace::ColorSpace;
/// A decoder that can handle errors
///
/// Returns the default flag set used by [`DecoderOptions::default`]:
/// tolerant of bitstream errors with the fast paths enabled.
fn decoder_error_tolerance_mode() -> DecoderFlags {
    // similar to fast options currently, so no need to write a new one
    fast_options()
}
/// Fast decoder options
///
/// Enables all intrinsics + unsafe routines
///
/// Disables png adler and crc checking.
fn fast_options() -> DecoderFlags {
    DecoderFlags {
        // integrity checks off: speed over strictness
        inflate_confirm_adler: false,
        png_confirm_crc: false,
        jpg_error_on_non_conformance: false,
        // every accelerated / unsafe path switched on
        zune_use_unsafe: true,
        zune_use_sse2: true,
        zune_use_sse3: true,
        zune_use_sse41: true,
        zune_use_avx: true,
        zune_use_avx2: true,
        zune_use_neon: true,
        // no implicit png transforms
        png_add_alpha_channel: false,
        png_strip_16_bit_to_8_bit: false,
        // animated formats decode every frame
        png_decode_animated: true,
        jxl_decode_animated: true
    }
}
/// Command line options error resilient and fast
///
/// Features
/// - Ignore CRC and Adler in png
/// - Do not error out on non-conformance in jpg
/// - Use unsafe paths
fn cmd_options() -> DecoderFlags {
DecoderFlags {
inflate_confirm_adler: false,
png_confirm_crc: false,
jpg_error_on_non_conformance: false,
zune_use_unsafe: true,
zune_use_neon: true,
zune_use_avx: true,
zune_use_avx2: true,
zune_use_sse2: true,
zune_use_sse3: true,
zune_use_sse41: true,
png_add_alpha_channel: false,
png_strip_16_bit_to_8_bit: false,
png_decode_animated: true,
jxl_decode_animated: true
}
}
/// Decoder options that are flags
///
/// NOTE: When you extend this, add true or false to
/// all options above that return a `DecoderFlag`
#[derive(Copy, Debug, Clone, Default)]
pub struct DecoderFlags {
    /// Whether the decoder should confirm and report adler mismatch
    inflate_confirm_adler: bool,
    /// Whether the PNG decoder should confirm crc
    png_confirm_crc: bool,
    /// Whether the jpg decoder should error out on image non-conformance
    // (the original comment said "png decoder" — the field is jpg-specific)
    jpg_error_on_non_conformance: bool,
    /// Whether the decoder should use unsafe platform specific intrinsics
    ///
    /// This will also shut down platform specific intrinsics `(ZUNE_USE_{EXT})` value
    zune_use_unsafe: bool,
    /// Whether we should use SSE2.
    ///
    /// This should be enabled for all x64 platforms but can be turned off if
    /// `ZUNE_USE_UNSAFE` is false
    zune_use_sse2: bool,
    /// Whether we should use SSE3 instructions where possible.
    zune_use_sse3: bool,
    /// Whether we should use sse4.1 instructions where possible.
    zune_use_sse41: bool,
    /// Whether we should use avx instructions where possible.
    zune_use_avx: bool,
    /// Whether we should use avx2 instructions where possible.
    zune_use_avx2: bool,
    /// Whether the png decoder should add alpha channel where possible.
    png_add_alpha_channel: bool,
    /// Whether we should use neon instructions where possible.
    zune_use_neon: bool,
    /// Whether the png decoder should strip 16 bit to 8 bit
    png_strip_16_bit_to_8_bit: bool,
    /// Decode all frames of an animated png instead of only the first
    png_decode_animated: bool,
    /// Decode all frames of an animated jxl instead of only the first
    jxl_decode_animated: bool
}
/// Decoder options
///
/// Not all options are respected by all decoders
#[derive(Debug, Copy, Clone)]
pub struct DecoderOptions {
    /// Maximum width for which decoders will
    /// not try to decode images larger than
    /// the specified width.
    ///
    /// - Default value: 16384
    /// - Respected by: `all decoders`
    max_width: usize,
    /// Maximum height for which decoders will not
    /// try to decode images larger than the
    /// specified height
    ///
    /// - Default value: 16384
    /// - Respected by: `all decoders`
    max_height: usize,
    /// Output colorspace
    ///
    /// The jpeg decoder allows conversion to a separate colorspace
    /// than the input.
    ///
    /// I.e you can convert a RGB jpeg image to grayscale without
    /// first decoding it to RGB to get
    ///
    /// - Default value: `ColorSpace::RGB`
    /// - Respected by: `jpeg`
    out_colorspace: ColorSpace,
    /// Maximum number of scans allowed
    /// for progressive jpeg images
    ///
    /// Progressive jpegs have scans
    ///
    /// - Default value:100
    /// - Respected by: `jpeg`
    max_scans: usize,
    /// Maximum size for deflate.
    /// Respected by all decoders that use inflate/deflate
    deflate_limit: usize,
    /// Boolean flags that influence decoding
    flags: DecoderFlags,
    /// The byte endian the returned bytes will be stored in,
    /// in case a single pixel spans more than a byte
    endianness: ByteEndian
}
/// Initializers
impl DecoderOptions {
    /// Create the decoder with options setting most configurable
    /// options to be their safe counterparts
    ///
    /// This is the same as `default` as default initializes
    /// options to the safe variant.
    ///
    /// Note, decoders running on this will be slower as it disables
    /// platform specific intrinsics
    pub fn new_safe() -> DecoderOptions {
        Self::default()
    }
    /// Create the decoder with options setting the configurable options
    /// to the fast counterparts
    ///
    /// This enables platform specific code paths and enable use of unsafe
    pub fn new_fast() -> DecoderOptions {
        Self::default().set_decoder_flags(fast_options())
    }
    /// Create the decoder options with the following characteristics
    ///
    /// - Use unsafe paths.
    /// - Ignore error checksuming, e.g in png we do not confirm adler and crc in this mode
    /// - Enable fast intrinsics paths
    pub fn new_cmd() -> DecoderOptions {
        Self::default().set_decoder_flags(cmd_options())
    }
}
/// Global options respected by all decoders
impl DecoderOptions {
    /// Get maximum width configured for which the decoder
    /// should not try to decode images greater than this width
    pub const fn max_width(&self) -> usize {
        self.max_width
    }
    /// Get maximum height configured for which the decoder should
    /// not try to decode images greater than this height
    pub const fn max_height(&self) -> usize {
        self.max_height
    }
    /// Return true whether the decoder should be in strict mode
    /// and reject most errors
    pub fn strict_mode(&self) -> bool {
        // idiom fix: short-circuiting `||` instead of bitwise `|` on bools —
        // identical result, stops evaluating at the first `true`
        self.flags.jpg_error_on_non_conformance
            || self.flags.png_confirm_crc
            || self.flags.inflate_confirm_adler
    }
    /// Return true if the decoder should use unsafe
    /// routines where possible
    pub const fn use_unsafe(&self) -> bool {
        self.flags.zune_use_unsafe
    }
    /// Set maximum width for which the decoder should not try
    /// decoding images greater than that width
    ///
    /// # Arguments
    ///
    /// * `width`: The maximum width allowed
    ///
    /// returns: DecoderOptions
    pub fn set_max_width(mut self, width: usize) -> Self {
        self.max_width = width;
        self
    }
    /// Set maximum height for which the decoder should not try
    /// decoding images greater than that height
    ///
    /// # Arguments
    ///
    /// * `height`: The maximum height allowed
    ///
    /// returns: DecoderOptions
    pub fn set_max_height(mut self, height: usize) -> Self {
        self.max_height = height;
        self
    }
    /// Whether the routines can use unsafe platform specific
    /// intrinsics when necessary
    ///
    /// Platform intrinsics are implemented for operations which
    /// the compiler can't auto-vectorize, or we can do a marginably
    /// better job at it
    ///
    /// All decoders with unsafe routines respect it.
    ///
    /// Treat this with caution, disabling it will cause slowdowns but
    /// it's provided mainly for debugging use.
    ///
    /// - Respected by: `png` and `jpeg`(decoders with unsafe routines)
    pub fn set_use_unsafe(mut self, yes: bool) -> Self {
        self.flags.zune_use_unsafe = yes;
        self
    }
    /// Replace the whole flag set at once; crate-internal helper used by the
    /// `new_fast`/`new_cmd` constructors.
    fn set_decoder_flags(mut self, flags: DecoderFlags) -> Self {
        self.flags = flags;
        self
    }
    /// Set whether the decoder should be in standards conforming/
    /// strict mode
    ///
    /// This reduces the error tolerance level for the decoders and invalid
    /// samples will be rejected by the decoder
    ///
    /// # Arguments
    ///
    /// * `yes`: enable (`true`) or disable (`false`) strict mode
    ///
    /// returns: DecoderOptions
    pub fn set_strict_mode(mut self, yes: bool) -> Self {
        // strict mode is the conjunction of the three checksum/conformance
        // flags; toggle them together
        self.flags.jpg_error_on_non_conformance = yes;
        self.flags.png_confirm_crc = yes;
        self.flags.inflate_confirm_adler = yes;
        self
    }
    /// Set the byte endian for which raw samples will be stored in,
    /// in case a single pixel sample spans more than a byte.
    ///
    /// The default is usually native endian hence big endian values
    /// will be converted to little endian on little endian systems,
    /// and little endian values will be converted to big endian on big endian systems
    ///
    /// # Arguments
    ///
    /// * `endian`: The endianness to which to set the bytes to
    ///
    /// returns: DecoderOptions
    pub fn set_byte_endian(mut self, endian: ByteEndian) -> Self {
        self.endianness = endian;
        self
    }
    /// Get the byte endian for which samples that span more than one byte will
    /// be treated
    pub const fn byte_endian(&self) -> ByteEndian {
        self.endianness
    }
}
/// PNG specific options
impl DecoderOptions {
    /// Whether the inflate decoder should confirm
    /// adler checksums
    pub const fn inflate_get_confirm_adler(&self) -> bool {
        self.flags.inflate_confirm_adler
    }
    /// Set whether the inflate decoder should confirm
    /// adler checksums
    pub fn inflate_set_confirm_adler(mut self, yes: bool) -> Self {
        self.flags.inflate_confirm_adler = yes;
        self
    }
    /// Get default inflate limit for which the decoder
    /// will not try to decompress further
    pub const fn inflate_get_limit(&self) -> usize {
        self.deflate_limit
    }
    /// Set the default inflate limit for which decompressors
    /// relying on inflate won't surpass this limit
    #[must_use]
    pub fn inflate_set_limit(mut self, limit: usize) -> Self {
        self.deflate_limit = limit;
        self
    }
    /// Whether the png decoder should confirm
    /// crc 32 checksums
    // (the original doc said "inflate decoder"; this getter reads the png flag)
    pub const fn png_get_confirm_crc(&self) -> bool {
        self.flags.png_confirm_crc
    }
    /// Set whether the png decoder should confirm
    /// CRC 32 checksums
    #[must_use]
    pub fn png_set_confirm_crc(mut self, yes: bool) -> Self {
        self.flags.png_confirm_crc = yes;
        self
    }
    /// Set whether the png decoder should add an alpha channel to
    /// images where possible.
    ///
    /// For Luma images, it converts it to Luma+Alpha
    ///
    /// For RGB images it converts it to RGB+Alpha
    pub fn png_set_add_alpha_channel(mut self, yes: bool) -> Self {
        self.flags.png_add_alpha_channel = yes;
        self
    }
    /// Return true whether the png decoder should add an alpha
    /// channel to images where possible
    pub const fn png_get_add_alpha_channel(&self) -> bool {
        self.flags.png_add_alpha_channel
    }
    /// Whether the png decoder should reduce 16 bit images to 8 bit
    /// images implicitly.
    ///
    /// Equivalent to [png::Transformations::STRIP_16](https://docs.rs/png/latest/png/struct.Transformations.html#associatedconstant.STRIP_16)
    pub fn png_set_strip_to_8bit(mut self, yes: bool) -> Self {
        self.flags.png_strip_16_bit_to_8_bit = yes;
        self
    }
    /// Return a boolean indicating whether the png decoder should reduce
    /// 16 bit images to 8 bit images implicitly
    pub const fn png_get_strip_to_8bit(&self) -> bool {
        self.flags.png_strip_16_bit_to_8_bit
    }
    /// Return whether `zune-image` should decode animated images or
    /// whether we should just decode the first frame only
    pub const fn png_decode_animated(&self) -> bool {
        self.flags.png_decode_animated
    }
    /// Set whether `zune-image` should decode animated images or
    /// whether we should just decode the first frame only
    pub const fn png_set_decode_animated(mut self, yes: bool) -> Self {
        self.flags.png_decode_animated = yes;
        self
    }
}
/// JPEG specific options
impl DecoderOptions {
    /// Get maximum scans for which the jpeg decoder
    /// should not go above for progressive images
    pub const fn jpeg_get_max_scans(&self) -> usize {
        self.max_scans
    }
    /// Set maximum scans for which the jpeg decoder should
    /// not exceed when reconstructing images.
    pub fn jpeg_set_max_scans(mut self, max_scans: usize) -> Self {
        self.max_scans = max_scans;
        self
    }
    /// Get expected output colorspace set by the user for which the image
    /// is expected to be reconstructed into.
    ///
    /// This may be different from the colorspace the decoder finally
    /// produces — per the setter below, the decoder can change it internally
    /// when it sees fit.
    pub const fn jpeg_get_out_colorspace(&self) -> ColorSpace {
        self.out_colorspace
    }
    /// Set expected colorspace for which the jpeg output is expected to be in
    ///
    /// This is mainly provided as is, we do not guarantee the decoder can convert to all colorspaces
    /// and the decoder can change it internally when it sees fit.
    #[must_use]
    pub fn jpeg_set_out_colorspace(mut self, colorspace: ColorSpace) -> Self {
        self.out_colorspace = colorspace;
        self
    }
}
/// Intrinsics support
///
/// These routines are compiled depending
/// on the platform they are used, if compiled for a platform
/// it doesn't support, (e.g avx2 on Arm), it will always return `false`
///
/// NOTE(review): every checker below combines its specific flag with
/// `zune_use_unsafe` using OR — i.e. the path is considered enabled when
/// *either* flag is set, so turning off only `zune_use_sse2` has no effect
/// while `zune_use_unsafe` remains set. Confirm `&&` wasn't intended.
impl DecoderOptions {
    /// Use SSE 2 code paths where possible
    ///
    /// This checks for existence of SSE2 first and returns
    /// false if it's not present
    #[allow(unreachable_code)]
    pub fn use_sse2(&self) -> bool {
        // idiom fix: `||` (short-circuit) instead of bitwise `|` on bools
        let opt = self.flags.zune_use_sse2 || self.flags.zune_use_unsafe;
        // options say no
        if !opt {
            return false;
        }
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        {
            // where we can do a runtime check that the feature is present
            #[cfg(feature = "std")]
            {
                if is_x86_feature_detected!("sse2") {
                    return true;
                }
            }
            // where we can't do a runtime check,
            // see whether the compile-time feature had it enabled
            #[cfg(all(not(feature = "std"), target_feature = "sse2"))]
            {
                return true;
            }
        }
        // everything failed, return false
        false
    }
    /// Use SSE 3 paths where possible
    ///
    ///
    /// This also checks for SSE3 support and returns false if
    /// it's not present
    #[allow(unreachable_code)]
    pub fn use_sse3(&self) -> bool {
        let opt = self.flags.zune_use_sse3 || self.flags.zune_use_unsafe;
        // options say no
        if !opt {
            return false;
        }
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        {
            // where we can do a runtime check that the feature is present
            #[cfg(feature = "std")]
            {
                if is_x86_feature_detected!("sse3") {
                    return true;
                }
            }
            // where we can't do a runtime check,
            // see whether the compile-time feature had it enabled
            #[cfg(all(not(feature = "std"), target_feature = "sse3"))]
            {
                return true;
            }
        }
        // everything failed, return false
        false
    }
    /// Use SSE4 paths where possible
    ///
    /// This also checks for sse 4.1 support and returns false if it
    /// is not present
    #[allow(unreachable_code)]
    pub fn use_sse41(&self) -> bool {
        let opt = self.flags.zune_use_sse41 || self.flags.zune_use_unsafe;
        // options say no
        if !opt {
            return false;
        }
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        {
            // where we can do a runtime check that the feature is present
            #[cfg(feature = "std")]
            {
                if is_x86_feature_detected!("sse4.1") {
                    return true;
                }
            }
            // where we can't do a runtime check,
            // see whether the compile-time feature had it enabled
            #[cfg(all(not(feature = "std"), target_feature = "sse4.1"))]
            {
                return true;
            }
        }
        // everything failed, return false
        false
    }
    /// Use AVX paths where possible
    ///
    /// This also checks for AVX support and returns false if it's
    /// not present
    #[allow(unreachable_code)]
    pub fn use_avx(&self) -> bool {
        let opt = self.flags.zune_use_avx || self.flags.zune_use_unsafe;
        // options say no
        if !opt {
            return false;
        }
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        {
            // where we can do a runtime check that the feature is present
            #[cfg(feature = "std")]
            {
                if is_x86_feature_detected!("avx") {
                    return true;
                }
            }
            // where we can't do a runtime check,
            // see whether the compile-time feature had it enabled
            #[cfg(all(not(feature = "std"), target_feature = "avx"))]
            {
                return true;
            }
        }
        // everything failed, return false
        false
    }
    /// Use avx2 paths where possible
    ///
    /// This also checks for AVX2 support and returns false if it's not
    /// present
    #[allow(unreachable_code)]
    pub fn use_avx2(&self) -> bool {
        let opt = self.flags.zune_use_avx2 || self.flags.zune_use_unsafe;
        // options say no
        if !opt {
            return false;
        }
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        {
            // where we can do a runtime check that the feature is present
            #[cfg(feature = "std")]
            {
                if is_x86_feature_detected!("avx2") {
                    return true;
                }
            }
            // where we can't do a runtime check,
            // see whether the compile-time feature had it enabled
            #[cfg(all(not(feature = "std"), target_feature = "avx2"))]
            {
                return true;
            }
        }
        // everything failed, return false
        false
    }
    /// Use NEON paths where possible (aarch64 only)
    #[allow(unreachable_code)]
    pub fn use_neon(&self) -> bool {
        let opt = self.flags.zune_use_neon || self.flags.zune_use_unsafe;
        // options say no
        if !opt {
            return false;
        }
        #[cfg(target_arch = "aarch64")]
        {
            // aarch64 implies neon on a compliant cpu
            // but for real prod should do something better here
            return true;
        }
        // everything failed, return false
        false
    }
}
/// JPEG_XL specific options
impl DecoderOptions {
    /// Return whether `zune-image` should decode animated jxl images or
    /// whether we should just decode the first frame only
    pub const fn jxl_decode_animated(&self) -> bool {
        self.flags.jxl_decode_animated
    }
    /// Set whether `zune-image` should decode animated jxl images or
    /// whether we should just decode the first frame only
    pub const fn jxl_set_decode_animated(mut self, yes: bool) -> Self {
        self.flags.jxl_decode_animated = yes;
        self
    }
}
impl Default for DecoderOptions {
    /// Create a default and sane option for decoders
    ///
    /// The following are the defaults
    ///
    /// - All decoders
    ///    - max_width: 16384 (`1 << 14`)
    ///    - max_height: 16384 (`1 << 14`)
    ///    - use_unsafe: Use unsafe intrinsics where possible.
    ///
    /// - JPEG
    ///     - max_scans: 100 (progressive images only, artificial cap to prevent a specific DOS)
    ///     - error_on_non_conformance: False (slightly corrupt images will be allowed)
    /// - DEFLATE
    ///     - deflate_limit: 1GB (will not continue decoding deflate archives larger than this)
    /// - PNG
    ///     - endianness: Default endianness is Big Endian when decoding 16 bit images to be viewed as 8 byte images
    ///     - confirm_crc: False (CRC will not be confirmed to be safe)
    ///     - strip_16_bit_to_8: False, 16 bit images are handled as 16 bit images
    ///     - add alpha: False, alpha channel is not added where it isn't present
    ///     - decode_animated: True: All frames in an animated image are decoded
    ///
    /// - JXL
    ///     - decode_animated: True: All frames in an animated image are decoded
    ///
    fn default() -> Self {
        Self {
            out_colorspace: ColorSpace::RGB,
            // 1 << 14 == 16384 pixels in each dimension
            max_width: 1 << 14,
            max_height: 1 << 14,
            max_scans: 100,
            // 1 << 30 == 1 GiB inflate output cap
            deflate_limit: 1 << 30,
            flags: decoder_error_tolerance_mode(),
            endianness: ByteEndian::BE
        }
    }
}

217
third_party/zune-core/src/options/encoder.rs vendored

@ -0,0 +1,217 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
use crate::bit_depth::BitDepth;
use crate::colorspace::ColorSpace;
/// Encoder options that are flags
#[derive(Copy, Debug, Clone, Default)]
struct EncoderFlags {
    /// Whether JPEG images should be encoded as progressive images
    jpeg_encode_progressive: bool,
    /// Whether JPEG images should use optimized huffman tables
    jpeg_optimize_huffman: bool,
    /// When true, metadata is stripped (not preserved) across image
    /// transformations; defaults to false via `derive(Default)`
    image_strip_metadata: bool
}
/// Options shared by some of the encoders in
/// the `zune-` family of image crates
#[derive(Debug, Copy, Clone)]
pub struct EncoderOptions {
    /// Width of the image to encode, in pixels
    width: usize,
    /// Height of the image to encode, in pixels
    height: usize,
    /// Colorspace the input samples are in
    colorspace: ColorSpace,
    /// Encoder quality; clamped to 0..=100 by `set_quality`
    quality: u8,
    /// Bit depth of the input samples
    depth: BitDepth,
    /// Number of threads for multi-threaded encoding where supported
    num_threads: u8,
    /// Encoder effort knob — presumably higher trades time for size;
    /// TODO confirm exact semantics per encoder
    effort: u8,
    /// Boolean flags that influence encoding
    flags: EncoderFlags
}
impl Default for EncoderOptions {
    /// Default encoder options: zero dimensions (the caller is expected to
    /// set real ones), RGB, quality 80, 8-bit depth, 4 threads, effort 4,
    /// all flags off.
    fn default() -> Self {
        Self {
            width: 0,
            height: 0,
            colorspace: ColorSpace::RGB,
            quality: 80,
            depth: BitDepth::Eight,
            num_threads: 4,
            effort: 4,
            flags: EncoderFlags::default()
        }
    }
}
impl EncoderOptions {
    /// Create new encode options
    ///
    /// # Arguments
    ///
    /// * `width`: Image width
    /// * `height`: Image height
    /// * `colorspace`: Image colorspace
    /// * `depth`: Image depth
    ///
    /// returns: EncoderOptions
    ///
    pub fn new(
        width: usize, height: usize, colorspace: ColorSpace, depth: BitDepth
    ) -> EncoderOptions {
        EncoderOptions {
            width,
            height,
            colorspace,
            depth,
            ..Default::default()
        }
    }
    /// Get the width for which the image will be encoded in
    // NOTE(review): unlike `height`, this does not assert non-zero — confirm
    // whether that asymmetry is intentional before changing it.
    pub const fn width(&self) -> usize {
        self.width
    }
    /// Get height for which the image will be encoded in
    ///
    /// returns: usize
    ///
    /// # Panics
    /// If height is zero
    pub fn height(&self) -> usize {
        assert_ne!(self.height, 0);
        self.height
    }
    /// Get the depth for which the image will be encoded in
    pub const fn depth(&self) -> BitDepth {
        self.depth
    }
    /// Get the quality for which the image will be encoded with
    ///
    /// # Lossy
    /// - Higher quality means some images take longer to write and
    ///   are big but they look good
    ///
    /// - Lower quality means small images and low quality.
    ///
    /// # Lossless
    /// - High quality indicates more time is spent in making the file
    ///   smaller
    ///
    /// - Low quality indicates less time is spent in making the file bigger
    pub const fn quality(&self) -> u8 {
        self.quality
    }
    /// Get the colorspace for which the image will be encoded in
    pub const fn colorspace(&self) -> ColorSpace {
        self.colorspace
    }
    /// Get the configured encoder effort
    pub const fn effort(&self) -> u8 {
        self.effort
    }
    /// Set width for the image to be encoded
    pub fn set_width(mut self, width: usize) -> Self {
        self.width = width;
        self
    }
    /// Set height for the image to be encoded
    pub fn set_height(mut self, height: usize) -> Self {
        self.height = height;
        self
    }
    /// Set depth for the image to be encoded
    pub fn set_depth(mut self, depth: BitDepth) -> Self {
        self.depth = depth;
        self
    }
    /// Set quality of the image to be encoded
    ///
    /// Quality is clamped from 0..100
    ///
    /// Quality means different options depending on the encoder, see
    /// [quality](Self::quality)
    pub fn set_quality(mut self, quality: u8) -> Self {
        self.quality = quality.clamp(0, 100);
        self
    }
    /// Set colorspace for the image to be encoded
    pub fn set_colorspace(mut self, colorspace: ColorSpace) -> Self {
        self.colorspace = colorspace;
        self
    }
    /// Set the number of threads allowed for multithreaded encoding
    /// where supported
    ///
    /// Zero means use a single thread
    pub fn set_num_threads(mut self, threads: u8) -> Self {
        self.num_threads = threads;
        self
    }
    /// Set the encoder effort
    pub fn set_effort(mut self, effort: u8) -> Self {
        self.effort = effort;
        self
    }
    /// Return number of threads configured for multithreading
    /// where possible
    ///
    /// This is used for multi-threaded encoders,
    /// currently only jpeg-xl
    pub const fn num_threads(&self) -> u8 {
        self.num_threads
    }
    /// Set whether the encoder should remove metadata from the image
    ///
    /// When set to `true`, supported encoders will strip away metadata
    /// from the resulting image. If set to false, where supported, encoders
    /// will not remove metadata from images
    pub fn set_strip_metadata(mut self, yes: bool) -> Self {
        self.flags.image_strip_metadata = yes;
        self
    }
    /// Whether or not the encoder should remove metadata from the image
    ///
    /// The default value is false, and encoders that respect this try to preserve as much
    /// data as possible from one image to another
    pub const fn strip_metadata(&self) -> bool {
        // BUG FIX: previously returned `!self.flags.image_strip_metadata`,
        // which inverted `set_strip_metadata` and made the getter report
        // `true` by default — contradicting the documented default of false.
        self.flags.image_strip_metadata
    }
}
/// JPEG options
impl EncoderOptions {
    /// Whether the jpeg encoder should encode the image in progressive mode
    ///
    /// Default is `false`.
    ///
    /// This may be used to create slightly smaller images at the cost of more processing
    /// time
    pub const fn jpeg_encode_progressive(&self) -> bool {
        self.flags.jpeg_encode_progressive
    }
    /// Whether the jpeg encoder should optimize huffman tables to create smaller files
    /// at the cost of processing time
    ///
    /// Default is `false`.
    pub const fn jpeg_optimized_huffman_tables(&self) -> bool {
        self.flags.jpeg_optimize_huffman
    }
    /// Set whether the jpeg encoder should encode the image in progressive mode
    ///
    /// Default is `false`
    pub fn set_jpeg_encode_progressive(mut self, yes: bool) -> Self {
        // BUG FIX: previously assigned `self.flags.jpeg_optimize_huffman`,
        // i.e. the setter toggled the wrong flag and progressive encoding
        // could never be enabled through it.
        self.flags.jpeg_encode_progressive = yes;
        self
    }
}

72
third_party/zune-core/src/result.rs vendored

@ -0,0 +1,72 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! Decoding results for images
use alloc::vec::Vec;
/// A simple enum that can hold decode
/// results of most images
#[non_exhaustive]
pub enum DecodingResult {
    /// 8-bit samples
    U8(Vec<u8>),
    /// 16-bit samples
    U16(Vec<u16>),
    /// 32-bit floating point samples
    F32(Vec<f32>)
}
impl DecodingResult {
    /// Return the contents if the enum stores `Vec<u8>` or otherwise
    /// return `None`.
    ///
    /// Useful for de-sugaring the result of a decoding operation
    /// into raw bytes
    ///
    /// # Example
    /// ```
    /// use zune_core::result::DecodingResult;
    /// let data = DecodingResult::U8(vec![0;100]);
    /// // we know this won't fail because we created it with u8
    /// assert!(data.u8().is_some());
    ///
    /// let data = DecodingResult::U16(vec![0;100]);
    /// // it should now return nothing since the type is u16
    /// assert!(data.u8().is_none());
    ///
    /// ```
    pub fn u8(self) -> Option<Vec<u8>> {
        match self {
            DecodingResult::U8(data) => Some(data),
            _ => None
        }
    }
    /// Return the contents if the enum stores `Vec<u16>` or otherwise
    /// return `None`.
    ///
    /// Useful for de-sugaring the result of a decoding operation
    /// into raw bytes
    ///
    /// # Example
    /// ```
    /// use zune_core::result::DecodingResult;
    /// let data = DecodingResult::U8(vec![0;100]);
    /// // this fails because we created it with u8, not u16
    /// assert!(data.u16().is_none());
    ///
    ///
    /// let data = DecodingResult::U16(vec![0;100]);
    /// // it should now return something since the type is u16
    /// assert!(data.u16().is_some());
    ///
    /// ```
    pub fn u16(self) -> Option<Vec<u16>> {
        match self {
            DecodingResult::U16(data) => Some(data),
            _ => None
        }
    }
    /// Return the contents if the enum stores `Vec<f32>` or otherwise
    /// return `None`.
    ///
    /// Added for parity with [`u8`](Self::u8) and [`u16`](Self::u16):
    /// the `F32` variant previously had no accessor at all.
    pub fn f32(self) -> Option<Vec<f32>> {
        match self {
            DecodingResult::F32(data) => Some(data),
            _ => None
        }
    }
}

63
third_party/zune-core/src/serde.rs vendored

@ -0,0 +1,63 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
#![cfg(feature = "serde")]
//! Serde support for serializing
//! crate datastructures
//!
//! Implements serialize for
//! - ColorSpace
//! - BitDepth
//! - ColorCharacteristics
use alloc::format;
use serde::ser::*;
use crate::bit_depth::BitDepth;
use crate::colorspace::{ColorCharacteristics, ColorSpace, RenderingIntent};
impl Serialize for ColorSpace {
    #[allow(clippy::uninlined_format_args)]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer
    {
        // colorspace serialization is simply its debug value; `collect_str`
        // streams the `Debug` output straight into the serializer instead of
        // allocating an intermediate `String` via `format!`.
        serializer.collect_str(&format_args!("{:?}", self))
    }
}
impl Serialize for BitDepth {
    #[allow(clippy::uninlined_format_args)]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer
    {
        // serialize as the Debug representation; `collect_str` avoids the
        // intermediate `String` allocation that `format!` would make.
        serializer.collect_str(&format_args!("{:?}", self))
    }
}
impl Serialize for ColorCharacteristics {
    #[allow(clippy::uninlined_format_args)]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer
    {
        // serialize as the Debug representation; `collect_str` avoids the
        // intermediate `String` allocation that `format!` would make.
        serializer.collect_str(&format_args!("{:?}", self))
    }
}
impl Serialize for RenderingIntent {
    #[allow(clippy::uninlined_format_args)]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer
    {
        // serialize as the Debug representation; `collect_str` avoids the
        // intermediate `String` allocation that `format!` would make.
        serializer.collect_str(&format_args!("{:?}", self))
    }
}

1
third_party/zune-jpeg/.gitignore vendored

@ -0,0 +1 @@
/target

79
third_party/zune-jpeg/Benches.md vendored

@ -0,0 +1,79 @@
# Benchmarks of popular jpeg libraries
Here I compare how long it takes popular JPEG decoders to decode the below 7680*4320 image
of (now defunct ?) [Cutefish OS](https://en.cutefishos.com/) default wallpaper.
![img](benches/images/speed_bench.jpg)
## About benchmarks
Benchmarks are weird, especially IO & multi-threaded programs. This library uses both of the above hence performance may
vary.
For best results shut down your machine, go take coffee, think about life and how it came to be and why people should
save the environment.
Then power up your machine, if it's a laptop connect it to a power supply and if there is a setting for performance
mode, tweak it.
Then run.
## Benchmarks vs real world usage
Real world usage may vary.
Notice that I'm using a large image but probably most decoding will be small to medium images.
To make the library thread safe, we do about 1.5-1.7x more allocations than libjpeg-turbo. Although, do note that the
allocations do not occur at ago, we allocate when needed and deallocate when not needed.
Do note that if memory bandwidth is a limitation, this library is not for you.
## Reproducibility
The benchmarks are carried out on my local machine with an AMD Ryzen 5 4500u
The benchmarks are reproducible.
To reproduce them
1. Clone this repository
2. Install rust(if you don't have it yet)
3. `cd` into the directory.
4. Run `cargo bench`
## Performance features of the three libraries
| feature | image-rs/jpeg-decoder | libjpeg-turbo | zune-jpeg |
|------------------------------|-----------------------|---------------|-----------|
| multithreaded | ✅ | ❌ | ❌ |
| platform specific intrinsics | ✅ | ✅ | ✅ |
- Image-rs/jpeg-decoder uses [rayon] under the hood but it's under a feature
flag.
- libjpeg-turbo uses hand-written asm for platform specific intrinsics, ported to
the most common architectures out there but falls back to scalar
code if it can't run in a platform.
# Finally benchmarks
[here]
## Notes
Benchmarks are ran at least once a week to catch regressions early and
are uploaded to Github pages.
Machine specs can be found on the other [landing page]
Benchmarks may not reflect real world usage(threads, other I/O machine bottlenecks)
[landing page]:https://etemesi254.github.io/posts/Zune-Benchmarks/
[here]:https://etemesi254.github.io/assets/criterion/report/index.html
[libjpeg-turbo]:https://github.com/libjpeg-turbo/libjpeg-turbo
[jpeg-decoder]:https://github.com/image-rs/jpeg-decoder
[rayon]:https://github.com/rayon-rs/rayon

26
third_party/zune-jpeg/Cargo.toml vendored

@ -0,0 +1,26 @@
[package]
name = "zune-jpeg"
version = "0.5.0-rc1"
authors = ["caleb <etemesicaleb@gmail.com>"]
edition = "2021"
repository = "https://github.com/etemesi254/zune-image/tree/dev/crates/zune-jpeg"
license = "MIT OR Apache-2.0 OR Zlib"
keywords = ["jpeg", "jpeg-decoder", "decoder"]
categories = ["multimedia::images"]
exclude = ["/benches/images/*", "/tests/*", "/.idea/*", "/.gradle/*", "/test-images/*", "fuzz/*"]
description = "A fast, correct and safe jpeg decoder"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[features]
x86 = []
neon = []
std = ["zune-core/std"]
log = ["zune-core/log"]
default = ["x86", "neon", "std"]
[dependencies]
zune-core = { path = "../zune-core", version = "^0.5.0-rc1" }
[dev-dependencies]
zune-ppm = { path = "../zune-ppm" }

64
third_party/zune-jpeg/Changelog.md vendored

@ -0,0 +1,64 @@
## Version 0.3.17
- Fix no-std compilation
## Version 0.3.16
- Add support for decoding to BGR and BGRA
## Version 0.3.14
- Add ability to parse exif and ICC chunk.
- Fix images with one component that were down-sampled.
### Version 0.3.13
- Allow decoding into pre-allocated buffer
- Clarify documentation
### Version 0.3.11
- Add guards for SSE and AVX code paths(allows compiling for platforms that do not support it)
### Version 0.3.0
- Overhaul to the whole decoder.
- Single threaded version
- Lightweight.
### Version 0.2.0
- New `ZuneJpegOptions` struct, this is the now recommended way to set up decoding options for
decoding
- Deprecated previous options setting functions.
- More code cleanups
- Fixed new bugs discovered by fuzzing
- Removed dependency on `num_cpu`
### Version 0.1.5
- Allow user to set memory limits in during decoding explicitly via `set_limits`
- Fixed some bugs discovered by fuzzing
- Correctly handle small images less than 16 pixels
- Gracefully handle incorrectly sampled images.
### Version 0.1.4
- Remove all `unsafe` instances except platform dependent intrinsics.
- Numerous bug fixes identified by fuzzing.
- Expose `ImageInfo` to the crate root.
### Version 0.1.3
- Fix numerous panics found by fuzzing(thanks to @[Shnatsel] for the corpus)
- Add new method `set_num_threads` that allows one to explicitly set the number of threads to use to decode the image.
### Version 0.1.2
- Add more sub checks, contributed by @[5225225]
- Privatize some modules.
### Version 0.1.1
- Fix rgba/rgbx decoding when avx optimized functions were used
- Initial support for fuzzing
- Remove `align_alloc` method which was unsound (Thanks to @[HeroicKatora] for pointing that out)
[Shnatsel]:https://github.com/Shnatsel
[HeroicKatora]:https://github.com/HeroicKatora
[5225225]:https://github.com/5225225

1
third_party/zune-jpeg/LICENSE-APACHE vendored

@ -0,0 +1 @@
../../LICENSE-APACHE

1
third_party/zune-jpeg/LICENSE-MIT vendored

@ -0,0 +1 @@
../../LICENSE-MIT

1
third_party/zune-jpeg/LICENSE-ZLIB vendored

@ -0,0 +1 @@
../../LICENSE-ZLIB

104
third_party/zune-jpeg/README.md vendored

@ -0,0 +1,104 @@
# Zune-JPEG
A fast, correct and safe jpeg decoder in pure Rust.
## Usage
The library provides a simple-to-use API for jpeg decoding
and an ability to add options to influence decoding.
### Example
```Rust
// Import the library
use zune_jpeg::JpegDecoder;
use std::fs::read;
fn main()->Result<(),DecoderErrors> {
// load some jpeg data
let data = read("cat.jpg").unwrap();
// create a decoder
let mut decoder = JpegDecoder::new(&data);
// decode the file
let pixels = decoder.decode()?;
}
```
The decoder supports more manipulations via `DecoderOptions`,
see additional documentation in the library.
## Goals
The implementation aims to have the following goals achieved,
in order of importance
1. Safety - Do not segfault on errors or invalid input. Panics are okay, but
should be fixed when reported. `unsafe` is only used for SIMD intrinsics,
and can be turned off entirely both at compile time and at runtime.
2. Speed - Get the data as quickly as possible, which means
1. Platform intrinsics code where justifiable
2. Carefully written platform independent code that allows the
compiler to vectorize it.
3. Regression tests.
4. Watch the memory usage of the program
3. Usability - Provide utility functions like different color conversions functions.
## Non-Goals
- Bit identical results with libjpeg/libjpeg-turbo will never be an aim of this library.
Jpeg is a lossy format with very few parts specified by the standard
(i.e it doesn't give a reference upsampling and color conversion algorithm)
## Features
- [x] A Pretty fast 8*8 integer IDCT.
- [x] Fast Huffman Decoding
- [x] Fast color convert functions.
- [x] Support for extended colorspaces like GrayScale and RGBA
- [X] Single-threaded decoding.
- [X] Support for four component JPEGs, and esoteric color schemes like CMYK
- [X] Support for `no_std`
- [X] BGR/BGRA decoding support.
## Crate Features
| feature | on | Capabilities |
|---------|-----|---------------------------------------------------------------------------------------------|
| `x86` | yes | Enables `x86` specific instructions, specifically `avx` and `sse` for accelerated decoding. |
| `std` | yes | Enable linking to the `std` crate |
Note that the `x86` features are automatically disabled on platforms that aren't x86 during compile
time hence there is no need to disable them explicitly if you are targeting such a platform.
## Using in a `no_std` environment
The crate can be used in a `no_std` environment with the `alloc` feature.
But one is required to link to a working allocator for whatever environment the decoder
will be running on
## Debug vs release
The decoder heavily relies on platform specific intrinsics, namely AVX2 and SSE to gain speed-ups in decoding,
but they [perform poorly](https://godbolt.org/z/vPq57z13b) in debug builds. To get reasonable performance even
when compiling your program in debug mode, add this to your `Cargo.toml`:
```toml
# `zune-jpeg` package will be always built with optimizations
[profile.dev.package.zune-jpeg]
opt-level = 3
```
## Benchmarks
The library tries to be as fast as [libjpeg-turbo] while being as safe as possible.
Platform specific intrinsics help get speed up intensive operations ensuring we can almost
match [libjpeg-turbo] speeds but speeds are always +- 10 ms of this library.
For more up-to-date benchmarks, see the online repo with
benchmarks [here](https://etemesi254.github.io/assets/criterion/report/index.html)
[libjpeg-turbo]:https://github.com/libjpeg-turbo/libjpeg-turbo/
[image-rs/jpeg-decoder]:https://github.com/image-rs/jpeg-decoder/tree/master/src

3
third_party/zune-jpeg/fuzz/.gitignore vendored

@ -0,0 +1,3 @@
target
corpus
artifacts

32
third_party/zune-jpeg/fuzz/Cargo.toml vendored

@ -0,0 +1,32 @@
[package]
name = "zune-jpeg-fuzz"
version = "0.0.0"
authors = ["Automatically generated"]
publish = false
edition = "2018"
[package.metadata]
cargo-fuzz = true
[dependencies]
libfuzzer-sys = "0.4"
[dependencies.zune-jpeg]
path = ".."
features = ["neon", "x86"]
# Prevent this from interfering with workspaces
[workspace]
members = ["."]
[[bin]]
name = "decode_buffer"
path = "fuzz_targets/decode_buffer.rs"
test = false
doc = false
[[bin]]
name = "fuzz_idct"
path = "fuzz_targets/fuzz_idct.rs"
test = false
doc = false

10
third_party/zune-jpeg/fuzz/fuzz_targets/decode_buffer.rs vendored

@ -0,0 +1,10 @@
#![no_main]
use libfuzzer_sys::fuzz_target;
fuzz_target!(|input: &[u8]| {
    use zune_jpeg::zune_core::bytestream::ZCursor;

    // Feed the raw fuzz bytes to a full decode. Decode errors are expected
    // and deliberately ignored; the fuzzer is only hunting for panics,
    // hangs and memory issues.
    let cursor = ZCursor::new(input);
    let mut decoder = zune_jpeg::JpegDecoder::new(cursor);
    let _ = decoder.decode();
});

47
third_party/zune-jpeg/fuzz/fuzz_targets/fuzz_idct.rs vendored

@ -0,0 +1,47 @@
#![no_main]
use libfuzzer_sys::fuzz_target;
use zune_jpeg::idct::scalar::idct_int;
// Differential fuzz target: run the platform's vectorised IDCT and the
// scalar reference IDCT on the same clamped input block and require
// bit-identical output.
fuzz_target!(|data: [i32; 64]| {
    let mut data = data;
    // keep in some relatively sane range
    // to prevent scalar overflows
    for d in &mut data
    {
        let bound = 255;
        *d = (*d).min(bound).max(-bound);
    }
    let mut data_vec = data;
    // this is way too big but it shouldn't matter
    // scalar and vector should mutate the minimum needed
    let mut output_scalar = [0i16; 64];
    let mut output_vector = [0i16; 64];
    // NOTE(review): deliberately declared without an initializer — if no cfg
    // branch below is compiled in, reading it in the `if` is a compile error,
    // which appears intended to force building this target only with a
    // vector IDCT available. Confirm this is the intent.
    let _must_use_supported_vector_arch;
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[cfg(target_feature = "avx2")]
    {
        use zune_jpeg::idct::avx2::idct_avx2;
        idct_avx2(&mut data_vec, &mut output_vector, 8);
        _must_use_supported_vector_arch = true;
    }
    #[cfg(target_arch = "aarch64")]
    {
        use zune_jpeg::idct::neon::idct_neon;
        idct_neon(&mut data_vec, &mut output_vector, 8);
        _must_use_supported_vector_arch = true;
    }
    if _must_use_supported_vector_arch
    {
        // run the scalar reference and compare against the vector result
        idct_int(&mut data, &mut output_scalar, 8);
        assert_eq!(output_scalar, output_vector, "IDCT and scalar do not match");
    }
    else
    {
        panic!("No vector IDCT ran!")
    }
});

671
third_party/zune-jpeg/src/bitstream.rs vendored

@ -0,0 +1,671 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
#![allow(
clippy::if_not_else,
clippy::similar_names,
clippy::inline_always,
clippy::doc_markdown,
clippy::cast_sign_loss,
clippy::cast_possible_truncation
)]
//! This file exposes a single struct that can decode a huffman encoded
//! Bitstream in a JPEG file
//!
//! This code is optimized for speed.
//! It's meant to be super duper super fast, because everyone else depends on this being fast.
//! It's (annoyingly) serial, hence we can't use parallel bitstreams (it's variable-length coding).
//!
//! Furthermore, on the case of refills, we have to do bytewise processing because the standard decided
//! that we want to support markers in the middle of streams(seriously few people use RST markers).
//!
//! So we pull in all optimization steps:
//! - use `inline[always]`? ✅ ,
//! - pre-execute most common cases ✅,
//! - add random comments ✅
//! - fast paths ✅.
//!
//! Speed-wise: It is probably the fastest JPEG BitStream decoder to ever sail the seven seas because of
//! a couple of optimization tricks.
//! 1. Fast refills from libjpeg-turbo
//! 2. As few as possible branches in decoder fast paths.
//! 3. Accelerated AC table decoding borrowed from stb_image.h written by Fabian Giesen (@rygorous),
//! improved by me to handle more cases.
//! 4. Safe and extensible routines(e.g. cool ways to eliminate bounds check)
//! 5. No unsafe here
//!
//! Readability comes as a second priority(I tried with variable names this time, and we are wayy better than libjpeg).
//!
//! Anyway if you are reading this it means you're cool and I hope you get whatever part of the code you are looking for
//! (or learn something cool)
//!
//! Knock yourself out.
use alloc::format;
use alloc::string::ToString;
use core::cmp::min;
use zune_core::bytestream::{ZByteReaderTrait, ZReader};
use crate::errors::DecodeErrors;
use crate::huffman::{HuffmanTable, HUFF_LOOKAHEAD};
use crate::marker::Marker;
use crate::mcu::DCT_BLOCK;
use crate::misc::UN_ZIGZAG;
macro_rules! decode_huff {
($stream:tt,$symbol:tt,$table:tt) => {
let mut code_length = $symbol >> HUFF_LOOKAHEAD;
($symbol) &= (1 << HUFF_LOOKAHEAD) - 1;
if code_length > i32::from(HUFF_LOOKAHEAD)
{
// if the symbol cannot be resolved in the first HUFF_LOOKAHEAD bits,
// we know it lies somewhere between HUFF_LOOKAHEAD and 16 bits since jpeg imposes 16 bit
// limit, we can therefore look 16 bits ahead and try to resolve the symbol
// starting from 1+HUFF_LOOKAHEAD bits.
$symbol = ($stream).peek_bits::<16>() as i32;
// (Credits to Sean T. Barrett stb library for this optimization)
// maxcode is pre-shifted 16 bytes long so that it has (16-code_length)
// zeroes at the end hence we do not need to shift in the inner loop.
while code_length < 17{
if $symbol < $table.maxcode[code_length as usize] {
break;
}
code_length += 1;
}
if code_length == 17{
// symbol could not be decoded.
//
// We may think, lets fake zeroes, noo
// panic, because Huffman codes are sensitive, probably everything
// after this will be corrupt, so no need to continue.
return Err(DecodeErrors::Format(format!("Bad Huffman Code 0x{:X}, corrupt JPEG",$symbol)))
}
$symbol >>= (16-code_length);
($symbol) = i32::from(
($table).values
[(($symbol + ($table).offset[code_length as usize]) & 0xFF) as usize],
);
}
// drop bits read
($stream).drop_bits(code_length as u8);
};
}
/// A `BitStream` struct, a bit by bit reader with super powers
///
pub(crate) struct BitStream {
    /// A MSB type buffer that is used for some certain operations
    pub buffer: u64,
    /// A TOP aligned MSB type buffer that is used to accelerate some operations like
    /// peek_bits and get_bits.
    ///
    /// By top aligned, I mean the top bit (63) represents the top bit in the buffer.
    aligned_buffer: u64,
    /// Number of valid bits currently held in the two buffers above
    pub(crate) bits_left: u8,
    /// Did we find a marker(RST/EOF) during decoding?
    pub marker: Option<Marker>,
    /// Progressive decoding: successive approximation high bit (Ah)
    pub successive_high: u8,
    /// Progressive decoding: successive approximation low bit (Al)
    pub successive_low: u8,
    /// Spectral selection start (Ss) for progressive scans
    spec_start: u8,
    /// Spectral selection end (Se) for progressive scans
    spec_end: u8,
    /// Remaining blocks in the current end-of-band run
    pub eob_run: i32,
    /// How many bytes we have read past the end of the stream
    pub overread_by: usize,
    /// True if we have seen end of image marker.
    /// Don't read anything after that.
    pub seen_eoi: bool
}
impl BitStream {
/// Create a new BitStream
pub(crate) const fn new() -> BitStream {
BitStream {
buffer: 0,
aligned_buffer: 0,
bits_left: 0,
marker: None,
successive_high: 0,
successive_low: 0,
spec_start: 0,
spec_end: 0,
eob_run: 0,
overread_by: 0,
seen_eoi: false
}
}
/// Create a new Bitstream for progressive decoding
#[allow(clippy::redundant_field_names)]
pub(crate) fn new_progressive(ah: u8, al: u8, spec_start: u8, spec_end: u8) -> BitStream {
BitStream {
buffer: 0,
aligned_buffer: 0,
bits_left: 0,
marker: None,
successive_high: ah,
successive_low: al,
spec_start: spec_start,
spec_end: spec_end,
eob_run: 0,
overread_by: 0,
seen_eoi: false
}
}
    /// Refill the bit buffer by (a maximum of) 32 bits
    ///
    /// # Arguments
    /// - `reader`:`&mut BufReader<R>`: A mutable reference to an underlying
    ///   File/Memory buffer containing a valid JPEG stream
    ///
    /// This function will only refill if `self.count` is less than 32
    ///
    /// Returns `Ok(false)` when a non-stuffing marker was encountered (the
    /// marker is stored in `self.marker`), `Ok(true)` otherwise.
    #[inline(always)] // to many call sites? ( perf improvement by 4%)
    fn refill<T>(&mut self, reader: &mut ZReader<T>) -> Result<bool, DecodeErrors>
    where
        T: ZByteReaderTrait
    {
        /// Macro version of a single byte refill.
        /// Arguments
        /// buffer-> our io buffer, because rust macros cannot get values from
        /// the surrounding environment bits_left-> number of bits left
        /// to full refill
        macro_rules! refill {
            ($buffer:expr,$byte:expr,$bits_left:expr) => {
                // read a byte from the stream
                $byte = u64::from(reader.read_u8());
                // track reads past the end of the stream so callers can
                // detect over-reads afterwards
                self.overread_by += usize::from(reader.eof()?);
                // append to the buffer
                // JPEG is a MSB type buffer so that means we append this
                // to the lower end (0..8) of the buffer and push the rest bits above..
                $buffer = ($buffer << 8) | $byte;
                // Increment bits left
                $bits_left += 8;
                // Check for special case of OxFF, to see if it's a stream or a marker
                if $byte == 0xff {
                    // read next byte
                    let mut next_byte = u64::from(reader.read_u8());
                    // Byte snuffing, if we encounter byte snuff, we skip the byte
                    if next_byte != 0x00 {
                        // skip that byte we read
                        while next_byte == 0xFF {
                            next_byte = u64::from(reader.read_u8());
                        }
                        if next_byte != 0x00 {
                            // Undo the byte append and return
                            $buffer >>= 8;
                            $bits_left -= 8;
                            if $bits_left != 0 {
                                self.aligned_buffer = $buffer << (64 - $bits_left);
                            }
                            self.marker =
                                Some(Marker::from_u8(next_byte as u8).ok_or_else(|| {
                                    DecodeErrors::Format(format!(
                                        "Unknown marker 0xFF{:X}",
                                        next_byte
                                    ))
                                })?);
                            return Ok(false);
                        }
                    }
                }
            };
        }
        // 32 bits is enough for a decode(16 bits) and receive_extend(max 16 bits)
        // If we have less than 32 bits we refill
        if self.bits_left < 32 && self.marker.is_none() && !self.seen_eoi {
            // we optimize for the case where we don't have 255 in the stream and have 4 bytes left
            // as it is the common case
            //
            // so we always read 4 bytes, if read_fixed_bytes errors out, the cursor is
            // guaranteed not to advance in case of failure (is this true), so
            // we revert the read later on (if we have 255), if this fails, we use the normal
            // byte at a time read
            if let Ok(bytes) = reader.read_fixed_bytes_or_error::<4>() {
                // we have 4 bytes to spare, read the 4 bytes into a temporary buffer
                // create buffer
                let msb_buf = u32::from_be_bytes(bytes);
                // check if we have 0xff
                if !has_byte(msb_buf, 255) {
                    // fast path: no marker/stuffing possible, bulk-append all 32 bits
                    self.bits_left += 32;
                    self.buffer <<= 32;
                    self.buffer |= u64::from(msb_buf);
                    self.aligned_buffer = self.buffer << (64 - self.bits_left);
                    return Ok(true);
                }
                // a 0xFF is present somewhere: undo the read and take the
                // careful bytewise path below
                reader.rewind(4)?;
            }
            // This serves two reasons,
            // 1: Make clippy shut up
            // 2: Favour register reuse
            let mut byte;
            // 4 refills, if all succeed the stream should contain enough bits to decode a
            // value
            refill!(self.buffer, byte, self.bits_left);
            refill!(self.buffer, byte, self.bits_left);
            refill!(self.buffer, byte, self.bits_left);
            refill!(self.buffer, byte, self.bits_left);
            // Construct an MSB buffer whose top bits are the bitstream we are currently holding.
            self.aligned_buffer = self.buffer << (64 - self.bits_left);
        }
        return Ok(true);
    }
    /// Decode the DC coefficient in a MCU block.
    ///
    /// The decoded coefficient is written to `dc_prediction`
    ///
    #[allow(
        clippy::cast_possible_truncation,
        clippy::cast_sign_loss,
        clippy::unwrap_used
    )]
    #[inline(always)]
    fn decode_dc<T>(
        &mut self, reader: &mut ZReader<T>, dc_table: &HuffmanTable, dc_prediction: &mut i32
    ) -> Result<bool, DecodeErrors>
    where
        T: ZByteReaderTrait
    {
        let (mut symbol, r);
        if self.bits_left < 32 {
            self.refill(reader)?;
        };
        // look a head HUFF_LOOKAHEAD bits into the bitstream
        symbol = self.peek_bits::<HUFF_LOOKAHEAD>();
        symbol = dc_table.lookup[symbol as usize];
        decode_huff!(self, symbol, dc_table);
        if symbol != 0 {
            // `symbol` is the magnitude category: read that many extra bits
            // and sign-extend them to recover the actual difference value
            r = self.get_bits(symbol as u8);
            symbol = huff_extend(r, symbol);
        }
        // Update DC prediction
        // (DC coefficients are coded as a difference from the previous block)
        *dc_prediction = dc_prediction.wrapping_add(symbol);
        return Ok(true);
    }
    /// Decode a Minimum Code Unit(MCU) as quickly as possible
    ///
    /// # Arguments
    /// - reader: The bitstream from where we read more bits.
    /// - dc_table: The Huffman table used to decode the DC coefficient
    /// - ac_table: The Huffman table used to decode AC values
    /// - qt_table: De-quantization table multiplied into each coefficient
    /// - block: A memory region where we will write out the decoded values
    /// - DC prediction: Last DC value for this component
    ///
    #[allow(
        clippy::many_single_char_names,
        clippy::cast_possible_truncation,
        clippy::cast_sign_loss
    )]
    #[inline(never)]
    pub fn decode_mcu_block<T>(
        &mut self, reader: &mut ZReader<T>, dc_table: &HuffmanTable, ac_table: &HuffmanTable,
        qt_table: &[i32; DCT_BLOCK], block: &mut [i32; 64], dc_prediction: &mut i32
    ) -> Result<(), DecodeErrors>
    where
        T: ZByteReaderTrait
    {
        // Get fast AC table as a reference before we enter the hot path
        let ac_lookup = ac_table.ac_lookup.as_ref().unwrap();
        let (mut symbol, mut r, mut fast_ac);
        // Decode AC coefficients
        let mut pos: usize = 1;
        // decode DC, dc prediction will contain the value
        self.decode_dc(reader, dc_table, dc_prediction)?;
        // set dc to be the dc prediction.
        // (de-quantization happens inline: multiply by the quant table entry)
        block[0] = *dc_prediction * qt_table[0];
        while pos < 64 {
            self.refill(reader)?;
            symbol = self.peek_bits::<HUFF_LOOKAHEAD>();
            fast_ac = ac_lookup[symbol as usize];
            symbol = ac_table.lookup[symbol as usize];
            if fast_ac != 0 {
                // FAST AC path
                // packed entry: bits 0..4 = bits to consume, 4..8 = zero run,
                // 8.. = coefficient value
                pos += ((fast_ac >> 4) & 15) as usize; // run
                let t_pos = UN_ZIGZAG[min(pos, 63)] & 63;
                block[t_pos] = i32::from(fast_ac >> 8) * (qt_table[t_pos]); // Value
                self.drop_bits((fast_ac & 15) as u8);
                pos += 1;
            } else {
                decode_huff!(self, symbol, ac_table);
                r = symbol >> 4;
                symbol &= 15;
                if symbol != 0 {
                    pos += r as usize;
                    r = self.get_bits(symbol as u8);
                    symbol = huff_extend(r, symbol);
                    let t_pos = UN_ZIGZAG[pos & 63] & 63;
                    block[t_pos] = symbol * qt_table[t_pos];
                    pos += 1;
                } else if r != 15 {
                    // symbol 0x00 => end of block, remaining coefficients stay zero
                    return Ok(());
                } else {
                    // symbol 0xF0 (ZRL) => skip 16 zero coefficients
                    pos += 16;
                }
            }
        }
        return Ok(());
    }
/// Peek `look_ahead` bits ahead without discarding them from the buffer
#[inline(always)]
#[allow(clippy::cast_possible_truncation)]
const fn peek_bits<const LOOKAHEAD: u8>(&self) -> i32 {
(self.aligned_buffer >> (64 - LOOKAHEAD)) as i32
}
/// Discard the next `N` bits without checking
#[inline]
fn drop_bits(&mut self, n: u8) {
self.bits_left = self.bits_left.saturating_sub(n);
self.aligned_buffer <<= n;
}
/// Read `n_bits` from the buffer and discard them
#[inline(always)]
#[allow(clippy::cast_possible_truncation)]
fn get_bits(&mut self, n_bits: u8) -> i32 {
let mask = (1_u64 << n_bits) - 1;
self.aligned_buffer = self.aligned_buffer.rotate_left(u32::from(n_bits));
let bits = (self.aligned_buffer & mask) as i32;
self.bits_left = self.bits_left.wrapping_sub(n_bits);
bits
}
/// Decode a DC block
#[allow(clippy::cast_possible_truncation)]
#[inline]
pub(crate) fn decode_prog_dc_first<T>(
&mut self, reader: &mut ZReader<T>, dc_table: &HuffmanTable, block: &mut i16,
dc_prediction: &mut i32
) -> Result<(), DecodeErrors>
where
T: ZByteReaderTrait
{
self.decode_dc(reader, dc_table, dc_prediction)?;
*block = (*dc_prediction as i16).wrapping_mul(1_i16 << self.successive_low);
return Ok(());
}
#[inline]
pub(crate) fn decode_prog_dc_refine<T>(
&mut self, reader: &mut ZReader<T>, block: &mut i16
) -> Result<(), DecodeErrors>
where
T: ZByteReaderTrait
{
// refinement scan
if self.bits_left < 1 {
self.refill(reader)?;
}
if self.get_bit() == 1 {
*block = block.wrapping_add(1 << self.successive_low);
}
Ok(())
}
/// Get a single bit from the bitstream
fn get_bit(&mut self) -> u8 {
let k = (self.aligned_buffer >> 63) as u8;
// discard a bit
self.drop_bits(1);
return k;
}
    /// Decode the first (non-refinement) pass of AC coefficients for a
    /// progressive scan, writing values shifted by the successive
    /// approximation low bit.
    pub(crate) fn decode_mcu_ac_first<T>(
        &mut self, reader: &mut ZReader<T>, ac_table: &HuffmanTable, block: &mut [i16; 64]
    ) -> Result<bool, DecodeErrors>
    where
        T: ZByteReaderTrait
    {
        let shift = self.successive_low;
        let fast_ac = ac_table.ac_lookup.as_ref().unwrap();
        let mut k = self.spec_start as usize;
        let (mut symbol, mut r, mut fac);
        // EOB runs are handled in mcu_prog.rs
        'block: loop {
            self.refill(reader)?;
            symbol = self.peek_bits::<HUFF_LOOKAHEAD>();
            fac = fast_ac[symbol as usize];
            symbol = ac_table.lookup[symbol as usize];
            if fac != 0 {
                // fast ac path
                k += ((fac >> 4) & 15) as usize; // run
                block[UN_ZIGZAG[min(k, 63)] & 63] = (fac >> 8).wrapping_mul(1 << shift); // value
                self.drop_bits((fac & 15) as u8);
                k += 1;
            } else {
                decode_huff!(self, symbol, ac_table);
                r = symbol >> 4;
                symbol &= 15;
                if symbol != 0 {
                    k += r as usize;
                    r = self.get_bits(symbol as u8);
                    symbol = huff_extend(r, symbol);
                    block[UN_ZIGZAG[k & 63] & 63] = (symbol as i16).wrapping_mul(1 << shift);
                    k += 1;
                } else {
                    if r != 15 {
                        // end-of-band: run length is 2^r plus the next r
                        // bits, minus the band ended by this symbol
                        self.eob_run = 1 << r;
                        self.eob_run += self.get_bits(r as u8);
                        self.eob_run -= 1;
                        break;
                    }
                    // ZRL symbol: skip 16 zero coefficients
                    k += 16;
                }
            }
            if k > self.spec_end as usize {
                break 'block;
            }
        }
        return Ok(true);
    }
    /// Refinement pass of AC coefficients for a progressive scan: append one
    /// correction bit to already-nonzero coefficients and insert newly
    /// nonzero coefficients at the successive-approximation bit position.
    #[allow(clippy::too_many_lines, clippy::op_ref)]
    pub(crate) fn decode_mcu_ac_refine<T>(
        &mut self, reader: &mut ZReader<T>, table: &HuffmanTable, block: &mut [i16; 64]
    ) -> Result<bool, DecodeErrors>
    where
        T: ZByteReaderTrait
    {
        // the single bit contributed by this refinement pass
        let bit = (1 << self.successive_low) as i16;
        let mut k = self.spec_start;
        let (mut symbol, mut r);
        if self.eob_run == 0 {
            'no_eob: loop {
                // Decode a coefficient from the bit stream
                self.refill(reader)?;
                symbol = self.peek_bits::<HUFF_LOOKAHEAD>();
                symbol = table.lookup[symbol as usize];
                decode_huff!(self, symbol, table);
                r = symbol >> 4;
                symbol &= 15;
                if symbol == 0 {
                    if r != 15 {
                        // EOB run is 2^r + bits
                        self.eob_run = 1 << r;
                        self.eob_run += self.get_bits(r as u8);
                        // EOB runs are handled by the eob logic
                        break 'no_eob;
                    }
                } else {
                    if symbol != 1 {
                        // in a refinement scan only magnitude-1 symbols are legal
                        return Err(DecodeErrors::HuffmanDecode(
                            "Bad Huffman code, corrupt JPEG?".to_string()
                        ));
                    }
                    // get sign bit
                    // We assume we have enough bits, which should be correct for sane images
                    // since we refill by 32 above
                    if self.get_bit() == 1 {
                        symbol = i32::from(bit);
                    } else {
                        symbol = i32::from(-bit);
                    }
                }
                // Advance over already nonzero coefficients appending
                // correction bits to the non-zeroes.
                // A correction bit is 1 if the absolute value of the coefficient must be increased
                if k <= self.spec_end {
                    'advance_nonzero: loop {
                        let coefficient = &mut block[UN_ZIGZAG[k as usize & 63] & 63];
                        if *coefficient != 0 {
                            if self.get_bit() == 1 && (*coefficient & bit) == 0 {
                                // move the magnitude away from zero by one
                                // refinement bit, preserving the sign
                                if *coefficient >= 0 {
                                    *coefficient += bit;
                                } else {
                                    *coefficient -= bit;
                                }
                            }
                            if self.bits_left < 1 {
                                self.refill(reader)?;
                            }
                        } else {
                            r -= 1;
                            if r < 0 {
                                // reached target zero coefficient.
                                break 'advance_nonzero;
                            }
                        };
                        if k == self.spec_end {
                            break 'advance_nonzero;
                        }
                        k += 1;
                    }
                }
                if symbol != 0 {
                    let pos = UN_ZIGZAG[k as usize & 63];
                    // output new non-zero coefficient.
                    block[pos & 63] = symbol as i16;
                }
                k += 1;
                if k > self.spec_end {
                    break 'no_eob;
                }
            }
        }
        if self.eob_run > 0 {
            // only run if block does not consists of purely zeroes
            if &block[1..] != &[0; 63] {
                self.refill(reader)?;
                while k <= self.spec_end {
                    let coefficient = &mut block[UN_ZIGZAG[k as usize & 63] & 63];
                    if *coefficient != 0 && self.get_bit() == 1 {
                        // check if we already modified it, if so do nothing, otherwise
                        // append the correction bit.
                        if (*coefficient & bit) == 0 {
                            if *coefficient >= 0 {
                                *coefficient = coefficient.wrapping_add(bit);
                            } else {
                                *coefficient = coefficient.wrapping_sub(bit);
                            }
                        }
                    }
                    if self.bits_left < 1 {
                        // refill at the last possible moment
                        self.refill(reader)?;
                    }
                    k += 1;
                }
            }
            // count a block completed in EOB run
            self.eob_run -= 1;
        }
        return Ok(true);
    }
pub fn update_progressive_params(&mut self, ah: u8, al: u8, spec_start: u8, spec_end: u8) {
self.successive_high = ah;
self.successive_low = al;
self.spec_start = spec_start;
self.spec_end = spec_end;
}
/// Reset the stream if we have a restart marker
///
/// Restart markers indicate drop those bits in the stream and zero out
/// everything
#[cold]
pub fn reset(&mut self) {
self.bits_left = 0;
self.marker = None;
self.buffer = 0;
self.aligned_buffer = 0;
self.eob_run = 0;
}
}
/// Do the equivalent of JPEG HUFF_EXTEND: sign-extend an `s`-bit magnitude
/// value `x` into its signed coefficient.
///
/// When `x < 2^(s-1)` the coded value is negative and `(1 << s) - 1` is
/// subtracted from it; otherwise `x` is returned unchanged.
#[inline(always)]
fn huff_extend(x: i32, s: i32) -> i32 {
    // branchless: the arithmetic shift yields all-ones exactly when
    // x < 2^(s-1), selecting the negative offset ((-1 << s) + 1)
    let negative_mask = (x - (1 << (s - 1))) >> 31;
    x + (negative_mask & (((-1) << s) + 1))
}
/// Check whether any byte lane of `v` is zero, without a loop.
///
/// Bithack retrieved from Stanford bithacks
/// @ https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
const fn has_zero(v: u32) -> bool {
    // after this fold, every lane that held a nonzero byte has its high
    // bit set; a cleared high bit survives the final negation
    let folded = (((v & 0x7F7F_7F7F) + 0x7F7F_7F7F) | v) | 0x7F7F_7F7F;
    !folded != 0
}
/// Check whether any byte lane of `b` equals `val`, without a loop.
///
/// Bithack retrieved from Stanford bithacks
/// @ https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
const fn has_byte(b: u32, val: u8) -> bool {
    // broadcast `val` into every byte lane (0x01010101 * val); XOR zeroes
    // exactly the lanes that matched, which has_zero then detects
    let broadcast = (val as u32) * (!0_u32 / 255);
    has_zero(b ^ broadcast)
}

89
third_party/zune-jpeg/src/color_convert.rs vendored

@ -0,0 +1,89 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
#![allow(
clippy::many_single_char_names,
clippy::similar_names,
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
clippy::cast_possible_wrap,
clippy::too_many_arguments,
clippy::doc_markdown
)]
//! Color space conversion routines
//!
//! This files exposes functions to convert one colorspace to another in a jpeg
//! image
//!
//! Currently supported conversions are
//!
//! - `YCbCr` to `RGB,RGBA,GRAYSCALE,RGBX`.
//!
//!
//! Hey there, if you're reading this it means you probably need something, so let me help you.
//!
//! There are 3 supported cpu extensions here.
//! 1. Scalar
//! 2. SSE
//! 3. AVX
//!
//! There are two types of the color convert functions
//!
//! 1. Acts on 16 pixels.
//! 2. Acts on 8 pixels.
//!
//! The reason for this is because when implementing the AVX part it occurred to me that we can actually
//! do better and process 2 MCU's if we change IDCT return type to be `i16's`, since a lot of
//! CPU's these days support AVX extensions, it becomes nice if we optimize for that path ,
//! therefore AVX routines can process 16 pixels directly and SSE and Scalar just compensate.
//!
//! By compensating, I mean I wrote the 16 pixels version operating on the 8 pixel version twice.
//!
//! Therefore if you're looking to optimize some routines, probably start there.
pub use scalar::ycbcr_to_grayscale;
use zune_core::colorspace::ColorSpace;
use zune_core::options::DecoderOptions;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "x86")]
pub use crate::color_convert::avx::{ycbcr_to_rgb_avx2, ycbcr_to_rgba_avx2};
use crate::decoder::ColorConvert16Ptr;
mod avx;
mod scalar;
/// Pick the YCbCr -> RGB(A)/BGR(A) conversion routine for `type_need`.
///
/// When compiled for x86/x86_64 with the `x86` feature and the options allow
/// AVX2, the AVX2 routines are preferred for RGB and RGBA; every other case
/// (including BGR/BGRA, which only have scalar implementations) falls through
/// to the scalar routines.
///
/// Returns `None` for colorspaces with no 16-pixel converter.
#[allow(unused_variables)]
pub fn choose_ycbcr_to_rgb_convert_func(
    type_need: ColorSpace, options: &DecoderOptions
) -> Option<ColorConvert16Ptr> {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[cfg(feature = "x86")]
    {
        use zune_core::log::debug;
        if options.use_avx2() {
            debug!("Using AVX optimised color conversion functions");
            // AVX2 only covers the straight RGB/RGBA orders; anything else
            // drops down to the scalar table below.
            match type_need {
                ColorSpace::RGB => return Some(ycbcr_to_rgb_avx2),
                ColorSpace::RGBA => return Some(ycbcr_to_rgba_avx2),
                _ => {}
            }
        }
    }
    // scalar fallback — also the only path on non-x86 builds
    match type_need {
        ColorSpace::RGB => Some(scalar::ycbcr_to_rgb_inner_16_scalar::<false>),
        ColorSpace::BGR => Some(scalar::ycbcr_to_rgb_inner_16_scalar::<true>),
        ColorSpace::RGBA => Some(scalar::ycbcr_to_rgba_inner_16_scalar::<false>),
        ColorSpace::BGRA => Some(scalar::ycbcr_to_rgba_inner_16_scalar::<true>),
        _ => None
    }
}

350
third_party/zune-jpeg/src/color_convert/avx.rs vendored

@ -0,0 +1,350 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! AVX color conversion routines
//!
//! Okay these codes are cool
//!
//! Herein lies super optimized codes to do color conversions.
//!
//!
//! 1. The YCbCr to RGB use integer approximations and not the floating point equivalent.
//! That means we may be +- 2 of pixels generated by libjpeg-turbo jpeg decoding
//! (also libjpeg uses routines like `Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G`)
//!
//! Firstly, we use integers (fun fact:there is no part of this code base where were dealing with
//! floating points.., fun fact: the first fun fact wasn't even fun.)
//!
//! Secondly ,we have cool clamping code, especially for rgba , where we don't need clamping and we
//! spend our time cursing that Intel decided permute instructions to work like 2 128 bit vectors(the compiler optimizes
//! it out to something cool).
//!
//! There isn't a lot here (not as fun as bitstream ) but I hope you find what you're looking for.
//!
//! O and ~~subscribe to my youtube channel~~
#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#![cfg(feature = "x86")]
#![allow(
clippy::wildcard_imports,
clippy::cast_possible_truncation,
clippy::too_many_arguments,
clippy::inline_always,
clippy::doc_markdown,
dead_code
)]
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
/// A 256-bit AVX register viewed either as the raw `__m256i` or as its
/// sixteen `i16` lanes.
///
/// Lets the conversion routines compute in SIMD (`mm256`) and then read the
/// results lane by lane (`array`) without an explicit store.
pub union YmmRegister {
    // both are 32 when using std::mem::size_of
    mm256: __m256i,
    // for avx color conversion
    array: [i16; 16]
}
//--------------------------------------------------------------------------------------------------
// AVX conversion routines
//--------------------------------------------------------------------------------------------------
///
/// Convert YCBCR to RGB using AVX instructions
///
/// # Note
///**IT IS THE RESPONSIBILITY OF THE CALLER TO CALL THIS IN CPUS SUPPORTING
/// AVX2 OTHERWISE THIS IS UB**
///
/// *Peace*
///
/// This library itself will ensure that it's never called in CPU's not
/// supporting AVX2
///
/// # Arguments
/// - `y`,`cb`,`cr`: References to 16 `i16` samples each (one per channel)
/// - `out`: The output array where we store our converted items
/// - `offset`: The position from 0 where we write these RGB values
///
/// # Panics
/// If fewer than 48 bytes remain in `out` past `*offset` (the inner routine
/// `expect`s that slice).
#[inline(always)]
pub fn ycbcr_to_rgb_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    // call this in another function to tell RUST to vectorize this
    // storing
    unsafe {
        ycbcr_to_rgb_avx2_1(y, cb, cr, out, offset);
    }
}
/// AVX2 worker for `ycbcr_to_rgb_avx2`.
///
/// Converts 16 pixels and interleaves them as 48 R,G,B bytes at
/// `out[*offset..]`, then advances `*offset` by 48.
///
/// # Panics
/// If fewer than 48 bytes remain in `out` past `*offset`.
///
/// # Safety
/// The CPU must support AVX and AVX2 (the `target_feature` attributes make
/// this the caller's obligation).
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx")]
unsafe fn ycbcr_to_rgb_avx2_1(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    // Load output buffer
    let tmp: &mut [u8; 48] = out
        .get_mut(*offset..*offset + 48)
        .expect("Slice to small cannot write")
        .try_into()
        .unwrap();
    let (r, g, b) = ycbcr_to_rgb_baseline(y, cb, cr);
    // interleave the clamped lanes into R,G,B triples; lanes are already in
    // 0..=255 so the `as u8` narrowing is lossless
    let mut j = 0;
    let mut i = 0;
    while i < 48 {
        tmp[i] = r.array[j] as u8;
        tmp[i + 1] = g.array[j] as u8;
        tmp[i + 2] = b.array[j] as u8;
        i += 3;
        j += 1;
    }
    *offset += 48;
}
/// Baseline implementation of YCBCR to RGB for avx,
///
/// It uses integer operations as opposed to floats, the approximation is
/// difficult for the eye to see, but this means that it may produce different
/// values with libjpeg_turbo. if accuracy is of utmost importance, use that.
///
/// this function should be called for most implementations, including
/// - ycbcr->rgb
/// - ycbcr->rgba
/// - ycbcr->brga
/// - ycbcr->rgbx
///
/// Returns the R, G and B lanes already clamped into `0..=255`.
///
/// # Safety
/// The CPU must support AVX and AVX2 (caller obligation via `target_feature`).
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx")]
unsafe fn ycbcr_to_rgb_baseline(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
) -> (YmmRegister, YmmRegister, YmmRegister) {
    // Load values into a register
    //
    // dst[127:0] := MEM[loaddr+127:loaddr]
    // dst[255:128] := MEM[hiaddr+127:hiaddr]
    let y_c = _mm256_loadu_si256(y.as_ptr().cast());
    let cb_c = _mm256_loadu_si256(cb.as_ptr().cast());
    let cr_c = _mm256_loadu_si256(cr.as_ptr().cast());
    // AVX version of integer version in https://stackoverflow.com/questions/4041840/function-to-convert-ycbcr-to-rgb
    // Cb = Cb-128;
    let cb_r = _mm256_sub_epi16(cb_c, _mm256_set1_epi16(128));
    // cr = Cb -128;
    let cr_r = _mm256_sub_epi16(cr_c, _mm256_set1_epi16(128));
    // Calculate Y->R
    // r = Y + 45 * Cr / 32
    // 45*cr
    let r1 = _mm256_mullo_epi16(_mm256_set1_epi16(45), cr_r);
    // r1>>5
    let r2 = _mm256_srai_epi16::<5>(r1);
    //y+r2
    let r = YmmRegister {
        mm256: clamp_avx(_mm256_add_epi16(y_c, r2))
    };
    // g = Y - (11 * Cb + 23 * Cr) / 32 ;
    // 11*cb
    let g1 = _mm256_mullo_epi16(_mm256_set1_epi16(11), cb_r);
    // 23*cr
    let g2 = _mm256_mullo_epi16(_mm256_set1_epi16(23), cr_r);
    //(11
    //(11 * Cb + 23 * Cr)
    let g3 = _mm256_add_epi16(g1, g2);
    // (11 * Cb + 23 * Cr) / 32
    let g4 = _mm256_srai_epi16::<5>(g3);
    // Y - (11 * Cb + 23 * Cr) / 32 ;
    let g = YmmRegister {
        mm256: clamp_avx(_mm256_sub_epi16(y_c, g4))
    };
    // b = Y + 113 * Cb / 64
    // 113 * cb
    let b1 = _mm256_mullo_epi16(_mm256_set1_epi16(113), cb_r);
    //113 * Cb / 64
    let b2 = _mm256_srai_epi16::<6>(b1);
    // b = Y + 113 * Cb / 64 ;
    let b = YmmRegister {
        mm256: clamp_avx(_mm256_add_epi16(b2, y_c))
    };
    return (r, g, b);
}
#[inline]
#[target_feature(enable = "avx2")]
/// A baseline implementation of YCbCr to RGB conversion which does not carry
/// out clamping
///
/// This is used by the `ycbcr_to_rgba_avx` and `ycbcr_to_rgbx` conversion
/// routines
///
/// Same integer arithmetic as `ycbcr_to_rgb_baseline`; the caller clamps via
/// the saturating pack (`_mm256_packus_epi16`) instead.
///
/// # Safety
/// The CPU must support AVX2 (caller obligation via `target_feature`).
unsafe fn ycbcr_to_rgb_baseline_no_clamp(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
) -> (__m256i, __m256i, __m256i) {
    // Load values into a register
    //
    let y_c = _mm256_loadu_si256(y.as_ptr().cast());
    let cb_c = _mm256_loadu_si256(cb.as_ptr().cast());
    let cr_c = _mm256_loadu_si256(cr.as_ptr().cast());
    // AVX version of integer version in https://stackoverflow.com/questions/4041840/function-to-convert-ycbcr-to-rgb
    // Cb = Cb-128;
    let cb_r = _mm256_sub_epi16(cb_c, _mm256_set1_epi16(128));
    // cr = Cb -128;
    let cr_r = _mm256_sub_epi16(cr_c, _mm256_set1_epi16(128));
    // Calculate Y->R
    // r = Y + 45 * Cr / 32
    // 45*cr
    let r1 = _mm256_mullo_epi16(_mm256_set1_epi16(45), cr_r);
    // r1>>5
    let r2 = _mm256_srai_epi16::<5>(r1);
    //y+r2
    let r = _mm256_add_epi16(y_c, r2);
    // g = Y - (11 * Cb + 23 * Cr) / 32 ;
    // 11*cb
    let g1 = _mm256_mullo_epi16(_mm256_set1_epi16(11), cb_r);
    // 23*cr
    let g2 = _mm256_mullo_epi16(_mm256_set1_epi16(23), cr_r);
    //(11
    //(11 * Cb + 23 * Cr)
    let g3 = _mm256_add_epi16(g1, g2);
    // (11 * Cb + 23 * Cr) / 32
    let g4 = _mm256_srai_epi16::<5>(g3);
    // Y - (11 * Cb + 23 * Cr) / 32 ;
    let g = _mm256_sub_epi16(y_c, g4);
    // b = Y + 113 * Cb / 64
    // 113 * cb
    let b1 = _mm256_mullo_epi16(_mm256_set1_epi16(113), cb_r);
    //113 * Cb / 64
    let b2 = _mm256_srai_epi16::<6>(b1);
    // b = Y + 113 * Cb / 64 ;
    let b = _mm256_add_epi16(b2, y_c);
    return (r, g, b);
}
/// Convert YCbCr to RGBA using AVX2 instructions, writing 64 bytes
/// (16 pixels x 4 channels) at `out[*offset..]` and advancing `*offset` by 64.
///
/// The alpha channel of every pixel is set to 255 (opaque).
///
/// # Note
/// **The caller must only invoke this on CPUs supporting AVX2, otherwise this
/// is UB** — same contract as `ycbcr_to_rgb_avx2`.
///
/// # Panics
/// If fewer than 64 bytes remain in `out` past `*offset`.
#[inline(always)]
pub fn ycbcr_to_rgba_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    unsafe {
        ycbcr_to_rgba_unsafe(y, cb, cr, out, offset);
    }
}
/// AVX2 worker for `ycbcr_to_rgba_avx2`: converts, clamps (via saturating
/// pack), interleaves R,G,B,255 and stores 64 bytes at `out[*offset..]`.
///
/// # Panics
/// If fewer than 64 bytes remain in `out` past `*offset`.
///
/// # Safety
/// The CPU must support AVX2 (caller obligation via `target_feature`).
#[inline]
#[target_feature(enable = "avx2")]
#[rustfmt::skip]
unsafe fn ycbcr_to_rgba_unsafe(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16],
    out: &mut [u8],
    offset: &mut usize,
)
{
    // check if we have enough space to write.
    let tmp:& mut [u8; 64] = out.get_mut(*offset..*offset + 64).expect("Slice to small cannot write").try_into().unwrap();
    let (r, g, b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);
    // set alpha channel to 255 for opaque
    // And no these comments were not from me pressing the keyboard
    // Pack the integers into u8's using signed saturation.
    let c = _mm256_packus_epi16(r, g); //aaaaa_bbbbb_aaaaa_bbbbbb
    let d = _mm256_packus_epi16(b, _mm256_set1_epi16(255)); // cccccc_dddddd_ccccccc_ddddd
    // transpose_u16 and interleave channels
    let e = _mm256_unpacklo_epi8(c, d); //ab_ab_ab_ab_ab_ab_ab_ab
    let f = _mm256_unpackhi_epi8(c, d); //cd_cd_cd_cd_cd_cd_cd_cd
    // final transpose_u16
    let g = _mm256_unpacklo_epi8(e, f); //abcd_abcd_abcd_abcd_abcd
    let h = _mm256_unpackhi_epi8(e, f);
    // undo packus shuffling...
    let i = _mm256_permute2x128_si256::<{ shuffle(3, 2, 1, 0) }>(g, h);
    let j = _mm256_permute2x128_si256::<{ shuffle(1, 2, 3, 0) }>(g, h);
    let k = _mm256_permute2x128_si256::<{ shuffle(3, 2, 0, 1) }>(g, h);
    let l = _mm256_permute2x128_si256::<{ shuffle(0, 3, 2, 1) }>(g, h);
    let m = _mm256_blend_epi32::<0b1111_0000>(i, j);
    let n = _mm256_blend_epi32::<0b1111_0000>(k, l);
    // Store
    // Use streaming instructions to prevent polluting the cache?
    _mm256_storeu_si256(tmp.as_mut_ptr().cast(), m);
    _mm256_storeu_si256(tmp[32..].as_mut_ptr().cast(), n);
    *offset += 64;
}
/// Clamp values between 0 and 255
///
/// Clamps every signed 16-bit lane of `reg` into `0..=255`, the accepted
/// range for 8-bit RGB samples.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn clamp_avx(reg: __m256i) -> __m256i {
    // per lane: min(max(value, 0), 255)
    _mm256_min_epi16(
        _mm256_max_epi16(reg, _mm256_setzero_si256()),
        _mm256_set1_epi16(255)
    )
}
/// Build an `_MM_SHUFFLE(z, y, x, w)`-style immediate: packs four 2-bit lane
/// selectors into one control value (`z` in bits 7:6 down to `w` in bits 1:0).
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    w | (x << 2) | (y << 4) | (z << 6)
}

116
third_party/zune-jpeg/src/color_convert/scalar.rs vendored

@ -0,0 +1,116 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
use core::convert::TryInto;
/// Limit a sample to the displayable `0..=255` range and narrow it to `u8`.
#[inline]
#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss, dead_code)]
fn clamp(a: i16) -> u8 {
    if a < 0 {
        0
    } else if a > 255 {
        255
    } else {
        a as u8
    }
}
/// Convert 16 YCbCr pixels to 4-byte RGBA (or BGRA) samples.
///
/// Writes 64 bytes starting at `output[*pos]` and advances `*pos` by 64; the
/// alpha byte of every pixel is set to 255.
///
/// - `BGRA == false`: pixels are stored as `R,G,B,A`
/// - `BGRA == true`:  pixels are stored as `B,G,R,A`
///
/// # Panics
/// If fewer than 64 bytes are available at `output[*pos..]`.
pub fn ycbcr_to_rgba_inner_16_scalar<const BGRA: bool>(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], output: &mut [u8], pos: &mut usize
) {
    let (_, tail) = output.split_at_mut(*pos);
    // Pin a fixed-size 64-byte window so the compiler can elide bounds checks
    // inside the loop.
    let window: &mut [u8; 64] = tail
        .get_mut(0..64)
        .expect("Slice to small cannot write")
        .try_into()
        .unwrap();
    for (idx, px) in window.chunks_exact_mut(4).enumerate() {
        let (luma, cb_c, cr_c) = (y[idx], cb[idx] - 128, cr[idx] - 128);
        // integer approximation of the YCbCr -> RGB transform
        let r = luma + ((45_i16.wrapping_mul(cr_c)) >> 5);
        let g = luma - ((11_i16.wrapping_mul(cb_c) + 23_i16.wrapping_mul(cr_c)) >> 5);
        let b = luma + ((113_i16.wrapping_mul(cb_c)) >> 6);
        let (first, third) = if BGRA { (b, r) } else { (r, b) };
        px[0] = clamp(first);
        px[1] = clamp(g);
        px[2] = clamp(third);
        px[3] = 255;
    }
    *pos += 64;
}
/// Convert 16 YCbCr pixels to 3-byte RGB (or BGR) samples.
///
/// Writes 48 bytes starting at `output[*pos]` and advances `*pos` by 48.
///
/// - `BGRA == false`: pixels are stored as `R,G,B`
/// - `BGRA == true`:  pixels are stored as `B,G,R`
///
/// # Panics
/// If fewer than 48 bytes are available at `output[*pos..]`.
pub fn ycbcr_to_rgb_inner_16_scalar<const BGRA: bool>(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], output: &mut [u8], pos: &mut usize
) {
    let (_, tail) = output.split_at_mut(*pos);
    // Pin a fixed-size 48-byte window so the compiler can elide bounds checks
    // inside the loop.
    let window: &mut [u8; 48] = tail
        .get_mut(0..48)
        .expect("Slice to small cannot write")
        .try_into()
        .unwrap();
    for (idx, px) in window.chunks_exact_mut(3).enumerate() {
        let (luma, cb_c, cr_c) = (y[idx], cb[idx] - 128, cr[idx] - 128);
        // integer approximation of the YCbCr -> RGB transform
        let r = luma + ((45_i16.wrapping_mul(cr_c)) >> 5);
        let g = luma - ((11_i16.wrapping_mul(cb_c) + 23_i16.wrapping_mul(cr_c)) >> 5);
        let b = luma + ((113_i16.wrapping_mul(cb_c)) >> 6);
        let (first, third) = if BGRA { (b, r) } else { (r, b) };
        px[0] = clamp(first);
        px[1] = clamp(g);
        px[2] = clamp(third);
    }
    *pos += 48;
}
/// Copy the luma (Y) plane into an 8-bit grayscale output.
///
/// Decoder rows are `padded_width` samples wide but only the first `width`
/// samples of each row are image data, so the trailing padding is skipped
/// while copying. Samples are narrowed with a plain `as u8` cast — no
/// clamping is performed here.
pub fn ycbcr_to_grayscale(y: &[i16], width: usize, padded_width: usize, output: &mut [u8]) {
    let luma_rows = y.chunks_exact(padded_width);
    let gray_rows = output.chunks_exact_mut(width);
    for (luma_row, gray_row) in luma_rows.zip(gray_rows) {
        for (gray, luma) in gray_row.iter_mut().zip(luma_row) {
            *gray = *luma as u8;
        }
    }
}

211
third_party/zune-jpeg/src/components.rs vendored

@ -0,0 +1,211 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! This module exports a single struct to store information about
//! JPEG image components
//!
//! The data is extracted from a SOF header.
use alloc::vec::Vec;
use alloc::{format, vec};
use zune_core::log::trace;
use crate::decoder::MAX_COMPONENTS;
use crate::errors::DecodeErrors;
use crate::upsampler::upsample_no_op;
/// Represents an up-sampler function, this function will be called to upsample
/// a down-sampled image
pub type UpSampler = fn(
input: &[i16],
in_near: &[i16],
in_far: &[i16],
scratch_space: &mut [i16],
output: &mut [i16]
);
/// Component Data from start of frame
#[derive(Clone)]
pub(crate) struct Components {
    /// The type of component that has the metadata below, can be Y,Cb or Cr
    pub component_id: ComponentID,
    /// Sub-sampling ratio of this component in the y-plane
    pub vertical_sample: usize,
    /// Sub-sampling ratio of this component in the x-plane
    pub horizontal_sample: usize,
    /// DC huffman table position
    pub dc_huff_table: usize,
    /// AC huffman table position for this element.
    pub ac_huff_table: usize,
    /// Quantization table number
    pub quantization_table_number: u8,
    /// Specifies quantization table to use with this component
    pub quantization_table: [i32; 64],
    /// dc prediction for the component
    pub dc_pred: i32,
    /// An up-sampling function, can be basic or SSE, depending
    /// on the platform
    pub up_sampler: UpSampler,
    /// How many pixels we need to advance to get to the next line
    pub width_stride: usize,
    /// Component ID for progressive
    pub id: u8,
    /// Whether we need to decode this image component.
    pub needed: bool,
    /// Upsample scanline
    pub raw_coeff: Vec<i16>,
    /// Upsample destination, stores a scanline worth of sub sampled data
    pub upsample_dest: Vec<i16>,
    /// previous row, used to handle MCU boundaries
    pub row_up: Vec<i16>,
    /// current row, used to handle MCU boundaries again
    pub row: Vec<i16>,
    /// First-row upsample scratch; filled with 128 (neutral sample) in
    /// `setup_upsample_scanline`
    pub first_row_upsample_dest: Vec<i16>,
    // write position for IDCT output — NOTE(review): presumably an index into
    // the coefficient buffer; confirm at the decode loops that update it
    pub idct_pos: usize,
    // NOTE(review): x/y/w2 look like current position and working width used
    // during decoding — confirm against the call sites that set them
    pub x: usize,
    pub w2: usize,
    pub y: usize,
    /// Sub-sampling ratio the component is stored with; used to size
    /// upsampling buffers in `setup_upsample_scanline`
    pub sample_ratio: SampleRatios,
    // a very annoying bug
    pub fix_an_annoying_bug: usize
}
impl Components {
    /// Create a new instance from three bytes from the start of frame
    ///
    /// # Arguments
    /// - `a`: The three SOF bytes for this component:
    ///   `[component id, packed sampling factors, quantization table index]`
    /// - `pos`: Zero-based position of the component inside the frame header;
    ///   used to assign the component type because the id byte is not
    ///   guaranteed to be ascending (see tests/inputs/huge_sof_number)
    ///
    /// # Errors
    /// Returns a format error when `pos` is out of range, the quantization
    /// table index is too large, or a sampling factor is not a power of two.
    #[inline]
    pub fn from(a: [u8; 3], pos: u8) -> Result<Components, DecodeErrors> {
        // it's a unique identifier.
        // doesn't have to be ascending
        // see tests/inputs/huge_sof_number
        //
        // For such cases, use the position of the component
        // to determine width
        let id = match pos {
            0 => ComponentID::Y,
            1 => ComponentID::Cb,
            2 => ComponentID::Cr,
            3 => ComponentID::Q,
            _ => {
                // `pos` is zero based, so only 0..=3 are valid positions
                return Err(DecodeErrors::Format(format!(
                    "Unknown component id found,{pos}, expected value between 0 and 3"
                )))
            }
        };
        // upper nibble = horizontal sampling factor, lower = vertical
        let horizontal_sample = (a[1] >> 4) as usize;
        let vertical_sample = (a[1] & 0x0f) as usize;
        let quantization_table_number = a[2];
        // confirm quantization number is less than MAX_COMPONENTS
        if usize::from(quantization_table_number) >= MAX_COMPONENTS {
            return Err(DecodeErrors::Format(format!(
                "Too large quantization number :{quantization_table_number}, expected value less than {MAX_COMPONENTS}"
            )));
        }
        // check that upsampling ratios are powers of two
        // if these fail, it's probably a corrupt image.
        if !horizontal_sample.is_power_of_two() {
            return Err(DecodeErrors::Format(format!(
                "Horizontal sample is not a power of two({horizontal_sample}) cannot decode"
            )));
        }
        if !vertical_sample.is_power_of_two() {
            return Err(DecodeErrors::Format(format!(
                "Vertical sub-sample is not power of two({vertical_sample}) cannot decode"
            )));
        }
        trace!(
            "Component ID:{:?} \tHS:{} VS:{} QT:{}",
            id,
            horizontal_sample,
            vertical_sample,
            quantization_table_number
        );
        Ok(Components {
            component_id: id,
            vertical_sample,
            horizontal_sample,
            quantization_table_number,
            first_row_upsample_dest: vec![],
            // These two will be set with sof marker
            dc_huff_table: 0,
            ac_huff_table: 0,
            quantization_table: [0; 64],
            dc_pred: 0,
            up_sampler: upsample_no_op,
            // set later
            width_stride: horizontal_sample,
            id: a[0],
            needed: true,
            raw_coeff: vec![],
            upsample_dest: vec![],
            row_up: vec![],
            row: vec![],
            idct_pos: 0,
            x: 0,
            y: 0,
            w2: 0,
            sample_ratio: SampleRatios::None,
            fix_an_annoying_bug: 1
        })
    }
    /// Setup space for upsampling
    ///
    /// During upsample, we need a reference of the last row so that upsampling can
    /// proceed correctly,
    /// so we store the last line of every scanline and use it for the next upsampling procedure
    /// to store this, but since we don't need it for 1v1 upsampling,
    /// we only call this for routines that need upsampling
    ///
    /// # Requirements
    /// - width stride of this element is set for the component.
    pub fn setup_upsample_scanline(&mut self) {
        self.row = vec![0; self.width_stride * self.vertical_sample];
        self.row_up = vec![0; self.width_stride * self.vertical_sample];
        // 128 is the neutral sample value, so the first row upsamples cleanly
        self.first_row_upsample_dest =
            vec![128; self.vertical_sample * self.width_stride * self.sample_ratio.sample()];
        self.upsample_dest =
            vec![0; self.width_stride * self.sample_ratio.sample() * self.fix_an_annoying_bug * 8];
    }
}
/// Component ID's
///
/// Identifies which plane of the image a `Components` entry describes.
#[derive(Copy, Debug, Clone, PartialEq, Eq)]
pub enum ComponentID {
    /// Luminance channel
    Y,
    /// Blue chrominance
    Cb,
    /// Red chrominance
    Cr,
    /// Q or fourth component
    Q
}
/// Sub-sampling configurations the decoder distinguishes.
#[derive(Copy, Debug, Clone, PartialEq, Eq)]
pub enum SampleRatios {
    /// Sub-sampled both horizontally and vertically
    HV,
    /// Sub-sampled vertically only
    V,
    /// Sub-sampled horizontally only
    H,
    /// No sub-sampling
    None
}
impl SampleRatios {
pub fn sample(self) -> usize {
match self {
SampleRatios::HV => 4,
SampleRatios::V | SampleRatios::H => 2,
SampleRatios::None => 1
}
}
}

910
third_party/zune-jpeg/src/decoder.rs vendored

@ -0,0 +1,910 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! Main image logic.
#![allow(clippy::doc_markdown)]
use alloc::string::ToString;
use alloc::vec::Vec;
use alloc::{format, vec};
use zune_core::bytestream::{ZByteReaderTrait, ZReader};
use zune_core::colorspace::ColorSpace;
use zune_core::log::{error, trace, warn};
use zune_core::options::DecoderOptions;
use crate::color_convert::choose_ycbcr_to_rgb_convert_func;
use crate::components::{Components, SampleRatios};
use crate::errors::{DecodeErrors, UnsupportedSchemes};
use crate::headers::{
parse_app1, parse_app14, parse_app2, parse_dqt, parse_huffman, parse_sos, parse_start_of_frame
};
use crate::huffman::HuffmanTable;
use crate::idct::choose_idct_func;
use crate::marker::Marker;
use crate::misc::SOFMarkers;
use crate::upsampler::{
choose_horizontal_samp_function, choose_hv_samp_function, choose_v_samp_function,
upsample_no_op
};
/// Maximum components
pub(crate) const MAX_COMPONENTS: usize = 4;
/// Maximum image dimensions supported.
pub(crate) const MAX_DIMENSIONS: usize = 1 << 27;
/// Color conversion function that can convert YCbCr colorspace to RGB(A/X) for
/// 16 values
///
/// The following are guarantees to the following functions
///
/// 1. The `&[i16]` slices passed contain 16 items
///
/// 2. The slices passed are in the following order
/// `y,cb,cr`
///
/// 3. `&mut [u8]` is zero initialized
///
/// 4. `&mut usize` points to the position in the array where new values should
/// be used
///
/// The pointer should
/// 1. Carry out color conversion
/// 2. Update `&mut usize` with the new position
pub type ColorConvert16Ptr = fn(&[i16; 16], &[i16; 16], &[i16; 16], &mut [u8], &mut usize);
/// IDCT function prototype
///
/// This encapsulates a dequantize and IDCT function which will carry out the
/// following functions
///
/// Multiply each 64 element block of `&mut [i16]` with `&Aligned32<[i32;64]>`
/// Carry out IDCT (type 3 dct) on ach block of 64 i16's
pub type IDCTPtr = fn(&mut [i32; 64], &mut [i16], usize);
/// An encapsulation of an ICC chunk
///
/// ICC profiles can be split across multiple APP2 markers; `seq_no` and
/// `num_markers` record where this chunk sits so the profile can be
/// reassembled in order.
pub(crate) struct ICCChunk {
    // sequence number of this chunk — NOTE(review): presumably 1-based per the
    // ICC-in-JPEG convention; confirm at the APP2 parse site
    pub(crate) seq_no: u8,
    // total number of chunks making up the profile
    pub(crate) num_markers: u8,
    // raw payload bytes of this chunk
    pub(crate) data: Vec<u8>
}
/// A JPEG Decoder Instance.
#[allow(clippy::upper_case_acronyms, clippy::struct_excessive_bools)]
pub struct JpegDecoder<T: ZByteReaderTrait> {
    /// Struct to hold image information from SOI
    pub(crate) info: ImageInfo,
    /// Quantization tables, will be set to none and the tables will
    /// be moved to `components` field
    pub(crate) qt_tables: [Option<[i32; 64]>; MAX_COMPONENTS],
    /// DC Huffman Tables with a maximum of 4 tables for each component
    pub(crate) dc_huffman_tables: [Option<HuffmanTable>; MAX_COMPONENTS],
    /// AC Huffman Tables with a maximum of 4 tables for each component
    pub(crate) ac_huffman_tables: [Option<HuffmanTable>; MAX_COMPONENTS],
    /// Image components, holds information like DC prediction and quantization
    /// tables of a component
    pub(crate) components: Vec<Components>,
    /// maximum horizontal component of all channels in the image
    pub(crate) h_max: usize,
    // maximum vertical component of all channels in the image
    pub(crate) v_max: usize,
    /// mcu's width (interleaved scans)
    pub(crate) mcu_width: usize,
    /// MCU height(interleaved scans
    pub(crate) mcu_height: usize,
    /// Number of MCU's in the x plane
    pub(crate) mcu_x: usize,
    /// Number of MCU's in the y plane
    pub(crate) mcu_y: usize,
    /// Is the image interleaved?
    pub(crate) is_interleaved: bool,
    /// Chroma sub-sampling configuration detected for the image
    pub(crate) sub_sample_ratio: SampleRatios,
    /// Image input colorspace, should be YCbCr for a sane image, might be
    /// grayscale too
    pub(crate) input_colorspace: ColorSpace,
    // Progressive image details
    /// Is the image progressive?
    pub(crate) is_progressive: bool,
    /// Start of spectral scan
    pub(crate) spec_start: u8,
    /// End of spectral scan
    pub(crate) spec_end: u8,
    /// Successive approximation bit position high
    pub(crate) succ_high: u8,
    /// Successive approximation bit position low
    pub(crate) succ_low: u8,
    /// Number of components.
    pub(crate) num_scans: u8,
    // Function pointers, for pointy stuff.
    /// Dequantize and idct function
    // This is determined at runtime which function to run, statically it's
    // initialized to a platform independent one and during initialization
    // of this struct, we check if we can switch to a faster one which
    // depend on certain CPU extensions.
    pub(crate) idct_func: IDCTPtr,
    // Color convert function which acts on 16 YCbCr values
    pub(crate) color_convert_16: ColorConvert16Ptr,
    // NOTE(review): looks like a component ordering table for scans — confirm
    // against the decode loops that index it
    pub(crate) z_order: [usize; MAX_COMPONENTS],
    /// restart markers
    pub(crate) restart_interval: usize,
    // initialized to a large sentinel (0x7fff_ffff) in `default` — presumably
    // counts work remaining until the next restart; TODO confirm at call sites
    pub(crate) todo: usize,
    // decoder options
    pub(crate) options: DecoderOptions,
    // byte-stream
    pub(crate) stream: ZReader<T>,
    // Indicate whether headers have been decoded
    pub(crate) headers_decoded: bool,
    pub(crate) seen_sof: bool,
    // exif data, lifted from app2
    pub(crate) exif_data: Option<Vec<u8>>,
    // ICC profile chunks collected from APP2 markers
    pub(crate) icc_data: Vec<ICCChunk>,
    // set when the APP0 payload starts with "AVI1\0" (motion JPEG)
    pub(crate) is_mjpeg: bool,
    pub(crate) coeff: usize // Solves some weird bug :)
}
impl<T> JpegDecoder<T>
where
T: ZByteReaderTrait
{
/// Construct a decoder with the given `options`, reading from `buffer`.
///
/// Picks the IDCT and 16-pixel color conversion function pointers up front
/// based on the options; they may later be replaced once the real output
/// colorspace is known.
fn default(options: DecoderOptions, buffer: T) -> Self {
    // an RGB converter always exists (scalar fallback), so unwrap is safe
    let color_convert = choose_ycbcr_to_rgb_convert_func(ColorSpace::RGB, &options).unwrap();
    JpegDecoder {
        info: ImageInfo::default(),
        qt_tables: [None, None, None, None],
        dc_huffman_tables: [None, None, None, None],
        ac_huffman_tables: [None, None, None, None],
        components: vec![],
        // Interleaved information
        h_max: 1,
        v_max: 1,
        mcu_height: 0,
        mcu_width: 0,
        mcu_x: 0,
        mcu_y: 0,
        is_interleaved: false,
        sub_sample_ratio: SampleRatios::None,
        is_progressive: false,
        spec_start: 0,
        spec_end: 0,
        succ_high: 0,
        succ_low: 0,
        num_scans: 0,
        idct_func: choose_idct_func(&options),
        color_convert_16: color_convert,
        input_colorspace: ColorSpace::YCbCr,
        z_order: [0; MAX_COMPONENTS],
        restart_interval: 0,
        todo: 0x7fff_ffff,
        options,
        stream: ZReader::new(buffer),
        headers_decoded: false,
        seen_sof: false,
        exif_data: None,
        icc_data: vec![],
        is_mjpeg: false,
        coeff: 1
    }
}
/// Decode a buffer already in memory
///
/// The buffer should be a valid jpeg file, perhaps created by
/// `std::fs::read()` or a JPEG file downloaded from the internet.
///
/// # Errors
/// See DecodeErrors for an explanation
pub fn decode(&mut self) -> Result<Vec<u8>, DecodeErrors> {
    self.decode_headers()?;
    // headers are decoded at this point, so the output size is known
    let mut pixels = vec![0; self.output_buffer_size().unwrap()];
    self.decode_into(&mut pixels)?;
    Ok(pixels)
}
/// Create a new Decoder instance
///
/// # Arguments
/// - `stream`: The raw bytes of a jpeg file.
#[must_use]
#[allow(clippy::new_without_default)]
pub fn new(stream: T) -> JpegDecoder<T> {
JpegDecoder::default(DecoderOptions::default(), stream)
}
/// Returns the image information
///
/// # Returns
/// - `Some(info)`: image width, height and number of components; only
///   available after a successful call to [`decode`] or [`decode_headers`]
/// - `None`: the image headers have not been decoded yet
///
/// [`decode`]: JpegDecoder::decode
/// [`decode_headers`]: JpegDecoder::decode_headers
#[must_use]
pub fn info(&self) -> Option<ImageInfo> {
    // `headers_decoded` is the single source of truth for whether `info`
    // holds real values rather than the default placeholder.
    self.headers_decoded.then(|| self.info.clone())
}
/// Return the number of bytes required to hold a decoded image frame
/// decoded using the given input transformations
///
/// # Returns
/// - `Some(usize)`: Minimum size for a buffer needed to decode the image
/// - `None`: Headers were not decoded, or `width * height * components`
///   would overflow a `usize`
#[must_use]
pub fn output_buffer_size(&self) -> Option<usize> {
    if !self.headers_decoded {
        return None;
    }
    usize::from(self.width())
        .checked_mul(usize::from(self.height()))?
        .checked_mul(self.options.jpeg_get_out_colorspace().num_components())
}
/// Get an immutable reference to the decoder options
/// for the decoder instance
///
/// This can be used to inspect (and, via [`set_options`](Self::set_options),
/// modify) options after creation but before actual decoding
///
/// # Example
/// ```no_run
/// use zune_core::bytestream::ZCursor;
/// use zune_jpeg::JpegDecoder;
///
/// let mut decoder = JpegDecoder::new(ZCursor::new(&[]));
/// // get current options
/// let mut options = decoder.options();
/// // modify it
/// let new_options = options.set_max_width(10);
/// // set it back
/// decoder.set_options(new_options);
///
/// ```
#[must_use]
pub const fn options(&self) -> &DecoderOptions {
    &self.options
}
/// Return the input colorspace of the image
///
/// This is the colorspace the image is stored in, which may differ from the
/// colorspace the decoded output is converted to.
///
/// # Returns
/// - `Some(colorspace)`: the stored input colorspace
/// - `None`: the headers weren't decoded yet
#[must_use]
pub fn input_colorspace(&self) -> Option<ColorSpace> {
    self.headers_decoded.then(|| self.input_colorspace)
}
/// Set decoder options
///
/// This can be used to set new options even after initialization
/// but before decoding.
///
/// This does not bear any significance after decoding an image
///
/// # Arguments
/// - `options`: New decoder options
///
/// # Example
/// Set maximum jpeg progressive passes to be 4
///
/// ```no_run
/// use zune_core::bytestream::ZCursor;
/// use zune_jpeg::JpegDecoder;
/// let mut decoder = JpegDecoder::new(ZCursor::new(&[]));
/// // this works also because DecoderOptions implements `Copy`
/// let options = decoder.options().jpeg_set_max_scans(4);
/// // set the new options
/// decoder.set_options(options);
/// // now decode
/// decoder.decode().unwrap();
/// ```
pub fn set_options(&mut self, options: DecoderOptions) {
    self.options = options;
}
/// Decode Decoder headers
///
/// This routine takes care of parsing supported headers from a Decoder
/// image. It scans the stream marker by marker (tolerating fill bytes and
/// byte stuffing) until the SOS marker, at which point header decoding is
/// complete and entropy-coded data follows.
///
/// # Supported Headers
/// - APP(0)
/// - SOF(O)
/// - DQT -> Quantization tables
/// - DHT -> Huffman tables
/// - SOS -> Start of Scan
/// # Unsupported Headers
/// - SOF(n) -> Decoder images which are not baseline/progressive
/// - DAC -> Images using Arithmetic tables
/// - JPG(n)
fn decode_headers_internal(&mut self) -> Result<(), DecodeErrors> {
    // idempotent: a second call is a no-op
    if self.headers_decoded {
        trace!("Headers decoded!");
        return Ok(());
    }
    // match output colorspace here
    // we know this will only be called once per image
    // so makes sense
    // We only care for ycbcr to rgb/rgba here
    // in case one is using another colorspace.
    // May god help you
    let out_colorspace = self.options.jpeg_get_out_colorspace();
    if matches!(
        out_colorspace,
        ColorSpace::BGR | ColorSpace::BGRA | ColorSpace::RGB | ColorSpace::RGBA
    ) {
        // all four colorspaces have converters, so unwrap cannot fail here
        self.color_convert_16 = choose_ycbcr_to_rgb_convert_func(
            self.options.jpeg_get_out_colorspace(),
            &self.options
        )
        .unwrap();
    }
    // First two bytes should be jpeg soi marker
    let magic_bytes = self.stream.get_u16_be_err()?;
    let mut last_byte = 0;
    let mut bytes_before_marker = 0;
    if magic_bytes != 0xffd8 {
        return Err(DecodeErrors::IllegalMagicBytes(magic_bytes));
    }
    loop {
        // read a byte
        let mut m = self.stream.read_u8_err()?;
        // AND OF COURSE some images will have fill bytes in their marker
        // bitstreams because why not.
        //
        // I am disappointed as a man.
        if (m == 0xFF || m == 0) && last_byte == 0xFF {
            // This handles the edge case where
            // images have markers with fill bytes(0xFF)
            // or byte stuffing (0)
            // I.e 0xFF 0xFF 0xDA
            // and
            // 0xFF 0 0xDA
            // It should ignore those fill bytes and take 0xDA
            // I don't know why such images exist
            // but they do.
            // so this is for you (with love)
            while m == 0xFF || m == 0x0 {
                last_byte = m;
                m = self.stream.read_u8_err()?;
            }
        }
        // Last byte should be 0xFF to confirm existence of a marker since markers look
        // like OxFF(some marker data)
        if last_byte == 0xFF {
            let marker = Marker::from_u8(m);
            if let Some(n) = marker {
                // more stray bytes than a marker prefix accounts for: reject
                // in strict mode, otherwise just log and continue
                if bytes_before_marker > 3 {
                    if self.options.strict_mode()
                    /*No reason to use this*/
                    {
                        return Err(DecodeErrors::FormatStatic(
                            "[strict-mode]: Extra bytes between headers"
                        ));
                    }
                    error!(
                        "Extra bytes {} before marker 0xFF{:X}",
                        bytes_before_marker - 3,
                        m
                    );
                }
                bytes_before_marker = 0;
                self.parse_marker_inner(n)?;
                // SOS ends the header section; what follows is scan data
                if n == Marker::SOS {
                    self.headers_decoded = true;
                    trace!("Input colorspace {:?}", self.input_colorspace);
                    return Ok(());
                }
            } else {
                // unknown marker: skip its payload using its declared length
                bytes_before_marker = 0;
                warn!("Marker 0xFF{:X} not known", m);
                let length = self.stream.get_u16_be_err()?;
                if length < 2 {
                    return Err(DecodeErrors::Format(format!(
                        "Found a marker with invalid length : {length}"
                    )));
                }
                warn!("Skipping {} bytes", length - 2);
                self.stream.skip((length - 2) as usize)?;
            }
        }
        last_byte = m;
        bytes_before_marker += 1;
    }
}
/// Dispatch a single marker found in the bitstream to its segment parser.
///
/// `m` is the marker identified by the caller (the byte following `0xFF`).
/// Each arm consumes the whole segment from `self.stream` before returning.
///
/// # Errors
/// Returns a [`DecodeErrors`] variant when the segment is malformed or the
/// encoding scheme is unsupported.
#[allow(clippy::too_many_lines)]
pub(crate) fn parse_marker_inner(&mut self, m: Marker) -> Result<(), DecodeErrors> {
    match m {
        Marker::SOF(0..=2) => {
            let marker = {
                // choose marker
                if m == Marker::SOF(0) || m == Marker::SOF(1) {
                    SOFMarkers::BaselineDct
                } else {
                    // SOF(2) is progressive, huffman coded
                    self.is_progressive = true;
                    SOFMarkers::ProgressiveDctHuffman
                }
            };
            trace!("Image encoding scheme =`{:?}`", marker);
            // get components
            parse_start_of_frame(marker, self)?;
        }
        // Start of Frame Segments not supported
        Marker::SOF(v) => {
            let feature = UnsupportedSchemes::from_int(v);
            if let Some(feature) = feature {
                return Err(DecodeErrors::Unsupported(feature));
            }
            return Err(DecodeErrors::Format("Unsupported image format".to_string()));
        }
        //APP(0) segment
        Marker::APP(0) => {
            // the length field includes its own two bytes
            let mut length = self.stream.get_u16_be_err()?;
            if length < 2 {
                return Err(DecodeErrors::Format(format!(
                    "Found a marker with invalid length:{length}\n"
                )));
            }
            // skip for now
            if length > 5 {
                let mut buffer = [0u8; 5];
                self.stream.read_exact_bytes(&mut buffer)?;
                // an "AVI1" identifier marks a motion-JPEG (MJPEG) frame
                if &buffer == b"AVI1\0" {
                    self.is_mjpeg = true;
                }
                length -= 5;
            }
            self.stream.skip(length.saturating_sub(2) as usize)?;
            //parse_app(buf, m, &mut self.info)?;
        }
        Marker::APP(1) => {
            // APP1 carries Exif metadata
            parse_app1(self)?;
        }
        Marker::APP(2) => {
            // APP2 carries ICC profile chunks
            parse_app2(self)?;
        }
        // Quantization tables
        Marker::DQT => {
            parse_dqt(self)?;
        }
        // Huffman tables
        Marker::DHT => {
            parse_huffman(self)?;
        }
        // Start of Scan Data
        Marker::SOS => {
            parse_sos(self)?;
            // break after reading the start of scan.
            // what follows is the image data
            return Ok(());
        }
        Marker::EOI => return Err(DecodeErrors::FormatStatic("Premature End of image")),
        Marker::DAC | Marker::DNL => {
            return Err(DecodeErrors::Format(format!(
                "Parsing of the following header `{m:?}` is not supported,\
                cannot continue"
            )));
        }
        Marker::DRI => {
            trace!("DRI marker present");
            // a DRI segment is always 4 bytes: 2 length bytes + 2 interval bytes
            if self.stream.get_u16_be_err()? != 4 {
                return Err(DecodeErrors::Format(
                    "Bad DRI length, Corrupt JPEG".to_string()
                ));
            }
            self.restart_interval = usize::from(self.stream.get_u16_be_err()?);
            self.todo = self.restart_interval;
        }
        Marker::APP(14) => {
            // Adobe APP14, may change the input colorspace
            parse_app14(self)?;
        }
        _ => {
            warn!(
                "Capabilities for processing marker \"{:?}\" not implemented",
                m
            );
            // unknown-but-well-formed marker: read its length and skip the payload
            let length = self.stream.get_u16_be_err()?;
            if length < 2 {
                return Err(DecodeErrors::Format(format!(
                    "Found a marker with invalid length:{length}\n"
                )));
            }
            warn!("Skipping {} bytes", length - 2);
            self.stream.skip((length - 2) as usize)?;
        }
    }
    Ok(())
}
/// Get the embedded ICC profile if it exists
/// and is correct
///
/// One needs not to decode the whole image to extract this,
/// calling [`decode_headers`] for an image with an ICC profile
/// allows you to decode this
///
/// # Returns
/// - `Some(Vec<u8>)`: The raw ICC profile of the image
/// - `None`: May indicate an error in the ICC profile , non-existence of
/// an ICC profile, or that the headers weren't decoded.
///
/// [`decode_headers`]:Self::decode_headers
#[must_use]
pub fn icc_profile(&self) -> Option<Vec<u8>> {
    if !self.headers_decoded {
        return None;
    }
    let chunk_count = self.icc_data.len();
    // seq_no is a u8, so more than 255 chunks cannot be addressed
    if chunk_count == 0 || chunk_count >= 255 {
        return None;
    }
    // one slot per possible sequence number; slot 0 stays unused
    let mut slots: [Option<&ICCChunk>; 256] = [None; 256];
    // validate every chunk and place it in its sequence slot
    for chunk in &self.icc_data {
        if usize::from(chunk.num_markers) != chunk_count {
            // all the lengths must match
            return None;
        }
        if chunk.seq_no == 0 {
            warn!("Zero sequence number in ICC, corrupt ICC chunk");
            return None;
        }
        let slot = &mut slots[usize::from(chunk.seq_no)];
        if slot.is_some() {
            // duplicate seq_no
            warn!("Duplicate sequence number in ICC, corrupt chunk");
            return None;
        }
        *slot = Some(chunk);
    }
    // stitch the chunks together in sequence order
    let mut profile = Vec::with_capacity(1000);
    for slot in &slots[1..=chunk_count] {
        match slot {
            Some(chunk) => profile.extend_from_slice(&chunk.data),
            None => {
                warn!("Missing icc sequence number, corrupt ICC chunk ");
                return None;
            }
        }
    }
    Some(profile)
}
/// Return the exif data for the file
///
/// This returns the raw exif data starting at the
/// TIFF header
///
/// # Returns
/// -`Some(data)`: The raw exif data, if present in the image
/// - None: May indicate the following
///
/// 1. The image doesn't have exif data
/// 2. The image headers haven't been decoded
#[must_use]
pub fn exif(&self) -> Option<&Vec<u8>> {
    // `exif_data` is filled by `parse_app1` during header decoding
    self.exif_data.as_ref()
}
/// Get the output colorspace the image pixels will be decoded into
///
///
/// # Note.
/// This field can only be regarded after decoding headers,
/// as markers such as Adobe APP14 may dictate different colorspaces
/// than requested.
///
/// Calling `decode_headers` is sufficient to know what colorspace the
/// output is, if this is called after `decode` it indicates the colorspace
/// the output is currently in
///
/// Additionally not all input->output colorspace mappings are supported
/// but all input colorspaces can map to RGB colorspace, so that's a safe bet
/// if one is handling image formats
///
///# Returns
/// - `Some(Colorspace)`: If headers have been decoded, the colorspace the
///output array will be in
///- `None
#[must_use]
pub fn output_colorspace(&self) -> Option<ColorSpace> {
    // only meaningful once the headers have been parsed
    self.headers_decoded
        .then(|| self.options.jpeg_get_out_colorspace())
}
/// Decode into a pre-allocated buffer
///
/// It is an error if the buffer size is smaller than
/// [`output_buffer_size()`](Self::output_buffer_size)
///
/// If the buffer is bigger than expected, we ignore the end padding bytes
///
/// # Example
///
/// - Read headers and then alloc a buffer big enough to hold the image
///
/// ```no_run
/// use zune_core::bytestream::ZCursor;
/// use zune_jpeg::JpegDecoder;
/// let mut decoder = JpegDecoder::new(ZCursor::new(&[]));
/// // before we get output, we must decode the headers to get width
/// // height, and input colorspace
/// decoder.decode_headers().unwrap();
///
/// let mut out = vec![0;decoder.output_buffer_size().unwrap()];
/// // write into out
/// decoder.decode_into(&mut out).unwrap();
/// ```
///
///
pub fn decode_into(&mut self, out: &mut [u8]) -> Result<(), DecodeErrors> {
    self.decode_headers_internal()?;
    // headers were decoded above, so the size is known
    let needed = self.output_buffer_size().unwrap();
    if out.len() < needed {
        // caller's buffer cannot hold the whole image
        return Err(DecodeErrors::TooSmallOutput(needed, out.len()));
    }
    // only write into the bytes we actually need; any caller padding
    // past `needed` is left untouched
    let out = &mut out[..needed];
    if self.is_progressive {
        self.decode_mcu_ycbcr_progressive(out)
    } else {
        self.decode_mcu_ycbcr_baseline(out)
    }
}
/// Read only headers from a jpeg image buffer
///
/// This allows you to extract important information like
/// image width and height without decoding the full image
///
/// # Examples
/// ```no_run
/// use zune_core::bytestream::ZCursor;
/// use zune_jpeg::{JpegDecoder};
///
/// let img_data = std::fs::read("a_valid.jpeg").unwrap();
/// let mut decoder = JpegDecoder::new(ZCursor::new(&img_data));
/// decoder.decode_headers().unwrap();
///
/// println!("Total decoder dimensions are : {:?} pixels",decoder.dimensions());
/// println!("Number of components in the image are {}", decoder.info().unwrap().components);
/// ```
/// # Errors
/// See DecodeErrors enum for list of possible errors during decoding
pub fn decode_headers(&mut self) -> Result<(), DecodeErrors> {
    // discard whatever the internal routine yields on success
    self.decode_headers_internal().map(|_| ())
}
/// Create a new decoder with the specified options to be used for decoding
/// an image
///
/// # Arguments
/// - `buf`: The input buffer from where we will pull in compressed jpeg bytes from
/// - `options`: Options specific to this decoder instance
#[must_use]
pub fn new_with_options(buf: T, options: DecoderOptions) -> JpegDecoder<T> {
JpegDecoder::default(options, buf)
}
/// Set up-sampling routines in case an image is down sampled
///
/// Picks the per-component up-sampling functions based on the ratio of
/// each component's sampling factors to the image maxima.
pub(crate) fn set_upsampling(&mut self) -> Result<(), DecodeErrors> {
    // fully sampled image (1x1 everywhere): nothing to set up
    if self.h_max == 1 && self.v_max == 1 {
        return Ok(());
    }
    // record the overall sub-sampling scheme of the image
    self.sub_sample_ratio = match (self.h_max, self.v_max) {
        (1, 1) => SampleRatios::None,
        (1, 2) => SampleRatios::V,
        (2, 1) => SampleRatios::H,
        (2, 2) => SampleRatios::HV,
        _ => {
            return Err(DecodeErrors::Format(
                "Unknown down-sampling method, cannot continue".to_string()
            ))
        }
    };
    for component in &mut self.components {
        // ratio of this component's resolution to the maximum
        let h_ratio = self.h_max / component.horizontal_sample;
        let v_ratio = self.v_max / component.vertical_sample;
        let up_sampler = match (h_ratio, v_ratio) {
            (1, 1) => {
                component.sample_ratio = SampleRatios::None;
                upsample_no_op
            }
            (2, 1) => {
                component.sample_ratio = SampleRatios::H;
                choose_horizontal_samp_function(self.options.use_unsafe())
            }
            (1, 2) => {
                component.sample_ratio = SampleRatios::V;
                choose_v_samp_function(self.options.use_unsafe())
            }
            (2, 2) => {
                component.sample_ratio = SampleRatios::HV;
                choose_hv_samp_function(self.options.use_unsafe())
            }
            _ => {
                return Err(DecodeErrors::Format(
                    "Unknown down-sampling method, cannot continue".to_string()
                ))
            }
        };
        component.setup_upsample_scanline();
        component.up_sampler = up_sampler;
    }
    Ok(())
}
#[must_use]
/// Get the width of the image as a u16
///
/// The width lies between 1 and 65535
pub(crate) fn width(&self) -> u16 {
    // width is recorded while parsing the SOF marker
    self.info.width
}
/// Get the height of the image as a u16
///
/// The height lies between 1 and 65535
#[must_use]
pub(crate) fn height(&self) -> u16 {
    // height is recorded while parsing the SOF marker
    self.info.height
}
/// Get image dimensions as a tuple of width and height
/// or `None` if the image hasn't been decoded.
///
/// # Returns
/// - `Some(width,height)`: Image dimensions
/// - None : The image headers haven't been decoded
#[must_use]
pub const fn dimensions(&self) -> Option<(usize, usize)> {
    // closures aren't allowed in `const fn`, so branch explicitly
    if self.headers_decoded {
        Some((self.info.width as usize, self.info.height as usize))
    } else {
        None
    }
}
}
/// A struct representing Image Information
///
/// Filled in while parsing the SOF and APP(0) markers; retrieved by callers
/// after `decode_headers`.
#[derive(Default, Clone, Eq, PartialEq)]
#[allow(clippy::module_name_repetitions)]
pub struct ImageInfo {
    /// Width of the image
    pub width: u16,
    /// Height of image
    pub height: u16,
    /// PixelDensity (bit precision from the SOF marker, see `set_density`)
    pub pixel_density: u8,
    /// Start of frame markers
    pub sof: SOFMarkers,
    /// Horizontal pixel density (dots per pixel), from the APP(0) marker
    pub x_density: u16,
    /// Vertical pixel density (dots per pixel), from the APP(0) marker
    pub y_density: u16,
    /// Number of components
    pub components: u8
}
impl ImageInfo {
    /// Set width of the image
    ///
    /// Found in the start of frame
    pub(crate) fn set_width(&mut self, width: u16) {
        self.width = width;
    }
    /// Set height of the image
    ///
    /// Found in the start of frame
    pub(crate) fn set_height(&mut self, height: u16) {
        self.height = height;
    }
    /// Set the image density
    ///
    /// Found in the start of frame
    pub(crate) fn set_density(&mut self, density: u8) {
        self.pixel_density = density;
    }
    /// Set image Start of frame marker
    ///
    /// found in the Start of frame header
    pub(crate) fn set_sof_marker(&mut self, marker: SOFMarkers) {
        self.sof = marker;
    }
    /// Set image x-density(dots per pixel)
    ///
    /// Found in the APP(0) marker
    #[allow(dead_code)]
    pub(crate) fn set_x(&mut self, sample: u16) {
        self.x_density = sample;
    }
    /// Set image y-density
    ///
    /// Found in the APP(0) marker
    #[allow(dead_code)]
    pub(crate) fn set_y(&mut self, sample: u16) {
        self.y_density = sample;
    }
}

167
third_party/zune-jpeg/src/errors.rs vendored

@ -0,0 +1,167 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! Contains most common errors that may be encountered in decoding a Decoder
//! image
use alloc::string::String;
use core::fmt::{Debug, Display, Formatter};
use zune_core::bytestream::ZByteIoError;
use crate::misc::{
START_OF_FRAME_EXT_AR, START_OF_FRAME_EXT_SEQ, START_OF_FRAME_LOS_SEQ,
START_OF_FRAME_LOS_SEQ_AR, START_OF_FRAME_PROG_DCT_AR
};
/// Common Decode errors
#[allow(clippy::module_name_repetitions)]
pub enum DecodeErrors {
    /// Any other thing we do not know
    Format(String),
    /// Any other thing we do not know but we
    /// don't need to allocate space on the heap
    FormatStatic(&'static str),
    /// Illegal Magic Bytes
    IllegalMagicBytes(u16),
    /// problems with the Huffman Tables in a Decoder file
    HuffmanDecode(String),
    /// Image has zero width
    ZeroError,
    /// Discrete Quantization Tables error
    DqtError(String),
    /// Start of scan errors
    SosError(String),
    /// Start of frame errors
    SofError(String),
    /// UnsupportedImages
    Unsupported(UnsupportedSchemes),
    /// MCU errors
    MCUError(String),
    /// Exhausted data
    ExhaustedData,
    /// Large image dimensions(Corrupted data)?
    LargeDimensions(usize),
    /// Too small output for size
    TooSmallOutput(usize, usize),
    /// Wrapper around I/O errors from the underlying byte stream
    IoErrors(ZByteIoError)
}
// `std::error::Error` is only implemented behind the `std` feature,
// since the crate is otherwise usable in `no_std` environments.
#[cfg(feature = "std")]
impl std::error::Error for DecodeErrors {}
impl From<&'static str> for DecodeErrors {
    /// Wrap a static message in the allocation-free `FormatStatic` variant.
    fn from(data: &'static str) -> Self {
        Self::FormatStatic(data)
    }
}
impl From<ZByteIoError> for DecodeErrors {
    /// Wrap an underlying byte-stream I/O failure in `IoErrors`.
    fn from(data: ZByteIoError) -> Self {
        Self::IoErrors(data)
    }
}
impl Debug for DecodeErrors {
    // Debug doubles as the human-readable description of the error;
    // the `Display` impl forwards to this output.
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        match &self
        {
            Self::Format(ref a) => write!(f, "{a:?}"),
            Self::FormatStatic(a) => write!(f, "{:?}", &a),
            Self::HuffmanDecode(ref reason) =>
            {
                write!(f, "Error decoding huffman values: {reason}")
            }
            Self::ZeroError => write!(f, "Image width or height is set to zero, cannot continue"),
            Self::DqtError(ref reason) => write!(f, "Error parsing DQT segment. Reason:{reason}"),
            Self::SosError(ref reason) => write!(f, "Error parsing SOS Segment. Reason:{reason}"),
            Self::SofError(ref reason) => write!(f, "Error parsing SOF segment. Reason:{reason}"),
            Self::IllegalMagicBytes(bytes) =>
            {
                write!(f, "Error parsing image. Illegal start bytes:{bytes:X}")
            }
            Self::MCUError(ref reason) => write!(f, "Error in decoding MCU. Reason {reason}"),
            Self::Unsupported(ref image_type) =>
            {
                // delegates to `UnsupportedSchemes`' own Debug text
                write!(f, "{image_type:?}")
            }
            Self::ExhaustedData => write!(f, "Exhausted data in the image"),
            Self::LargeDimensions(ref dimensions) => write!(
                f,
                "Too large dimensions {dimensions},library supports up to {}", crate::decoder::MAX_DIMENSIONS
            ),
            Self::TooSmallOutput(expected, found) => write!(f, "Too small output, expected buffer with at least {expected} bytes but got one with {found} bytes"),
            Self::IoErrors(error)=>write!(f,"I/O errors {error:?}"),
        }
    }
}
impl Display for DecodeErrors {
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        // user-facing output intentionally mirrors the Debug representation
        write!(f, "{:?}", self)
    }
}
/// Contains Unsupported/Yet-to-be supported Decoder image encoding types.
#[derive(Eq, PartialEq, Copy, Clone)]
pub enum UnsupportedSchemes {
    /// SOF_1 Extended sequential DCT,Huffman coding
    ExtendedSequentialHuffman,
    /// Lossless (sequential), huffman coding,
    LosslessHuffman,
    /// Extended sequential DCT, arithmetic coding
    ExtendedSequentialDctArithmetic,
    /// Progressive DCT, arithmetic coding,
    ProgressiveDctArithmetic,
    /// Lossless ( sequential), arithmetic coding
    LosslessArithmetic
}
impl Debug for UnsupportedSchemes {
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        // map each scheme to its fixed explanation, then emit it once
        let reason = match self {
            Self::ExtendedSequentialHuffman => {
                "The library cannot yet decode images encoded using Extended Sequential Huffman encoding scheme yet."
            }
            Self::LosslessHuffman => {
                "The library cannot yet decode images encoded with Lossless Huffman encoding scheme"
            }
            Self::ExtendedSequentialDctArithmetic => {
                "The library cannot yet decode Images Encoded with Extended Sequential DCT Arithmetic scheme"
            }
            Self::ProgressiveDctArithmetic => {
                "The library cannot yet decode images encoded with Progressive DCT Arithmetic scheme"
            }
            Self::LosslessArithmetic => {
                "The library cannot yet decode images encoded with Lossless Arithmetic encoding scheme"
            }
        };
        f.write_str(reason)
    }
}
impl UnsupportedSchemes {
    #[must_use]
    /// Create an unsupported scheme from an integer
    ///
    /// # Returns
    /// `Some(UnsupportedScheme)` if the int refers to a specific scheme,
    /// otherwise returns `None`
    pub fn from_int(int: u8) -> Option<UnsupportedSchemes> {
        // SOF markers are two bytes: 0xFF followed by the marker id
        let marker = 0xFF00_u16 | u16::from(int);
        match marker {
            START_OF_FRAME_PROG_DCT_AR => Some(Self::ProgressiveDctArithmetic),
            START_OF_FRAME_LOS_SEQ => Some(Self::LosslessHuffman),
            START_OF_FRAME_LOS_SEQ_AR => Some(Self::LosslessArithmetic),
            START_OF_FRAME_EXT_SEQ => Some(Self::ExtendedSequentialHuffman),
            START_OF_FRAME_EXT_AR => Some(Self::ExtendedSequentialDctArithmetic),
            _ => None
        }
    }
}

544
third_party/zune-jpeg/src/headers.rs vendored

@ -0,0 +1,544 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! Decode Decoder markers/segments
//!
//! This file deals with decoding header information in a jpeg file
//!
use alloc::format;
use alloc::string::ToString;
use alloc::vec::Vec;
use zune_core::bytestream::ZByteReaderTrait;
use zune_core::colorspace::ColorSpace;
use zune_core::log::{debug, error, trace, warn};
use crate::components::Components;
use crate::decoder::{ICCChunk, JpegDecoder, MAX_COMPONENTS};
use crate::errors::DecodeErrors;
use crate::huffman::HuffmanTable;
use crate::misc::{SOFMarkers, UN_ZIGZAG};
///**B.2.4.2 Huffman table-specification syntax**
///
/// Parse a DHT segment and build the DC/AC Huffman tables it defines.
/// One DHT segment may contain several tables; we loop until the declared
/// segment length is consumed.
#[allow(clippy::similar_names, clippy::cast_sign_loss)]
pub(crate) fn parse_huffman<T: ZByteReaderTrait>(
    decoder: &mut JpegDecoder<T>
) -> Result<(), DecodeErrors>
where
{
    // Read the length of the Huffman table
    // (minus the two bytes of the length field itself)
    let mut dht_length = i32::from(decoder.stream.get_u16_be_err()?.checked_sub(2).ok_or(
        DecodeErrors::FormatStatic("Invalid Huffman length in image")
    )?);
    // each table needs at least 1 info byte + 16 code-length counts
    while dht_length > 16 {
        // HT information
        let ht_info = decoder.stream.read_u8_err()?;
        // third bit indicates whether the huffman encoding is DC or AC type
        let dc_or_ac = (ht_info >> 4) & 0xF;
        // Indicate the position of this table, should be less than 4;
        let index = (ht_info & 0xF) as usize;
        // read the number of symbols
        // (element 0 is unused; counts are for code lengths 1..=16)
        let mut num_symbols: [u8; 17] = [0; 17];
        if index >= MAX_COMPONENTS {
            return Err(DecodeErrors::HuffmanDecode(format!(
                "Invalid DHT index {index}, expected between 0 and 3"
            )));
        }
        if dc_or_ac > 1 {
            return Err(DecodeErrors::HuffmanDecode(format!(
                "Invalid DHT position {dc_or_ac}, should be 0 or 1"
            )));
        }
        decoder.stream.read_exact_bytes(&mut num_symbols[1..17])?;
        // consumed so far: 1 info byte + 16 count bytes
        dht_length -= 1 + 16;
        let symbols_sum: i32 = num_symbols.iter().map(|f| i32::from(*f)).sum();
        // The sum of the number of symbols cannot be greater than 256;
        if symbols_sum > 256 {
            return Err(DecodeErrors::FormatStatic(
                "Encountered Huffman table with excessive length in DHT"
            ));
        }
        // symbol bytes must fit in what remains of the segment
        if symbols_sum > dht_length {
            return Err(DecodeErrors::HuffmanDecode(format!(
                "Excessive Huffman table of length {symbols_sum} found when header length is {dht_length}"
            )));
        }
        dht_length -= symbols_sum;
        // A table containing symbols in increasing code length
        let mut symbols = [0; 256];
        decoder
            .stream
            .read_exact_bytes(&mut symbols[0..(symbols_sum as usize)])?;
        // store
        match dc_or_ac {
            0 => {
                decoder.dc_huffman_tables[index] = Some(HuffmanTable::new(
                    &num_symbols,
                    symbols,
                    true,
                    decoder.is_progressive
                )?);
            }
            _ => {
                decoder.ac_huffman_tables[index] = Some(HuffmanTable::new(
                    &num_symbols,
                    symbols,
                    false,
                    decoder.is_progressive
                )?);
            }
        }
    }
    // leftover bytes mean the declared length disagrees with the tables read
    if dht_length > 0 {
        return Err(DecodeErrors::FormatStatic("Bogus Huffman table definition"));
    }
    Ok(())
}
///**B.2.4.1 Quantization table-specification syntax**
///
/// Parse a DQT segment; one segment may define several 8-bit or 16-bit
/// quantization tables, each stored un-zig-zagged into `img.qt_tables`.
#[allow(clippy::cast_possible_truncation, clippy::needless_range_loop)]
pub(crate) fn parse_dqt<T: ZByteReaderTrait>(img: &mut JpegDecoder<T>) -> Result<(), DecodeErrors> {
    // read length
    // (minus the two bytes of the length field itself)
    let mut qt_length =
        img.stream
            .get_u16_be_err()?
            .checked_sub(2)
            .ok_or(DecodeErrors::FormatStatic(
                "Invalid DQT length. Length should be greater than 2"
            ))?;
    // A single DQT header may have multiple QT's
    while qt_length > 0 {
        let qt_info = img.stream.read_u8_err()?;
        // 0 = 8 bit otherwise 16 bit dqt
        let precision = (qt_info >> 4) as usize;
        // last 4 bits give us position
        let table_position = (qt_info & 0x0f) as usize;
        // payload bytes for this table: 64 (8-bit) or 128 (16-bit)
        let precision_value = 64 * (precision + 1);
        if (precision_value + 1) as u16 > qt_length {
            return Err(DecodeErrors::DqtError(format!("Invalid QT table bytes left :{}. Too small to construct a valid qt table which should be {} long", qt_length, precision_value + 1)));
        }
        let dct_table = match precision {
            0 => {
                let mut qt_values = [0; 64];
                img.stream.read_exact_bytes(&mut qt_values)?;
                qt_length -= (precision_value as u16) + 1 /*QT BIT*/;
                // carry out un zig-zag here
                un_zig_zag(&qt_values)
            }
            1 => {
                // 16 bit quantization tables
                let mut qt_values = [0_u16; 64];
                for i in 0..64 {
                    qt_values[i] = img.stream.get_u16_be_err()?;
                }
                qt_length -= (precision_value as u16) + 1;
                un_zig_zag(&qt_values)
            }
            _ => {
                return Err(DecodeErrors::DqtError(format!(
                    "Expected QT precision value of either 0 or 1, found {precision:?}"
                )));
            }
        };
        if table_position >= MAX_COMPONENTS {
            return Err(DecodeErrors::DqtError(format!(
                "Too large table position for QT :{table_position}, expected between 0 and 3"
            )));
        }
        img.qt_tables[table_position] = Some(dct_table);
    }
    return Ok(());
}
/// Section:`B.2.2 Frame header syntax`
///
/// Parse a Start of Frame segment: reads bit precision, image dimensions
/// and the per-component sampling information, validates them against the
/// decoder limits and stores the results in `img`.
///
/// # Errors
/// Returns a `DecodeErrors` variant on a duplicate SOF marker, unsupported
/// bit depth, zero or over-limit dimensions, or a header length that
/// disagrees with the component count.
pub(crate) fn parse_start_of_frame<T: ZByteReaderTrait>(
    sof: SOFMarkers, img: &mut JpegDecoder<T>
) -> Result<(), DecodeErrors> {
    if img.seen_sof {
        return Err(DecodeErrors::SofError(
            "Two Start of Frame Markers".to_string()
        ));
    }
    // Get length of the frame header
    let length = img.stream.get_u16_be_err()?;
    // usually 8, but can be 12 and 16, we currently support only 8
    // so sorry about that 12 bit images
    let dt_precision = img.stream.read_u8_err()?;
    if dt_precision != 8 {
        return Err(DecodeErrors::SofError(format!(
            "The library can only parse 8-bit images, the image has {dt_precision} bits of precision"
        )));
    }
    img.info.set_density(dt_precision);
    // read and set the image height.
    let img_height = img.stream.get_u16_be_err()?;
    img.info.set_height(img_height);
    // read and set the image width
    let img_width = img.stream.get_u16_be_err()?;
    img.info.set_width(img_width);
    trace!("Image width :{}", img_width);
    trace!("Image height :{}", img_height);
    // enforce the configurable dimension limits
    // (error message fixed: was the garbled "If use `set_limits` if you want…")
    if usize::from(img_width) > img.options.max_width() {
        return Err(DecodeErrors::Format(format!("Image width {} greater than width limit {}. Use `set_limits` if you want to support huge images", img_width, img.options.max_width())));
    }
    if usize::from(img_height) > img.options.max_height() {
        return Err(DecodeErrors::Format(format!("Image height {} greater than height limit {}. Use `set_limits` if you want to support huge images", img_height, img.options.max_height())));
    }
    // Check image width or height is zero
    if img_width == 0 || img_height == 0 {
        return Err(DecodeErrors::ZeroError);
    }
    // Number of components for the image.
    let num_components = img.stream.read_u8_err()?;
    if num_components == 0 {
        return Err(DecodeErrors::SofError(
            "Number of components cannot be zero.".to_string()
        ));
    }
    // the header is 8 fixed bytes plus 3 bytes per component
    let expected = 8 + 3 * u16::from(num_components);
    // length should be equal to num components
    if length != expected {
        return Err(DecodeErrors::SofError(format!(
            "Length of start of frame differs from expected {expected},value is {length}"
        )));
    }
    trace!("Image components : {}", num_components);
    if num_components == 1 {
        // SOF sets the number of image components
        // and that to us translates to setting input and output
        // colorspaces to zero
        img.input_colorspace = ColorSpace::Luma;
        img.options = img.options.jpeg_set_out_colorspace(ColorSpace::Luma);
        debug!("Overriding default colorspace set to Luma");
    }
    if num_components == 4 && img.input_colorspace == ColorSpace::YCbCr {
        trace!("Input image has 4 components, defaulting to CMYK colorspace");
        // https://entropymine.wordpress.com/2018/10/22/how-is-a-jpeg-images-color-type-determined/
        img.input_colorspace = ColorSpace::CMYK;
    }
    // set number of components
    img.info.components = num_components;
    let mut components = Vec::with_capacity(num_components as usize);
    let mut temp = [0; 3];
    for pos in 0..num_components {
        // read 3 bytes for each component
        img.stream.read_exact_bytes(&mut temp)?;
        // create a component.
        let component = Components::from(temp, pos)?;
        components.push(component);
    }
    img.seen_sof = true;
    img.info.set_sof_marker(sof);
    img.components = components;
    Ok(())
}
/// Parse a start of scan data (SOS segment)
///
/// Reads the per-scan component list with their Huffman-table selectors,
/// then the progressive parameters `Ss`, `Se`, `Ah`, `Al`, validating each
/// against ITU-T T.81 (page 42).
///
/// # Errors
/// Returns a `DecodeErrors` variant when the scan header length, component
/// count, component ids or progressive parameters are out of range.
pub(crate) fn parse_sos<T: ZByteReaderTrait>(
    image: &mut JpegDecoder<T>
) -> Result<(), DecodeErrors> {
    // Scan header length
    let ls = image.stream.get_u16_be_err()?;
    // Number of image components in scan
    let ns = image.stream.read_u8_err()?;
    // component ids already used in this scan; -1 marks an unused slot
    let mut seen = [-1; { MAX_COMPONENTS + 1 }];
    image.num_scans = ns;
    // Ls must equal 6 + 2 * Ns (B.2.3)
    if ls != 6 + 2 * u16::from(ns) {
        return Err(DecodeErrors::SosError(format!(
            "Bad SOS length {ls},corrupt jpeg"
        )));
    }
    // Check number of components; the valid range is 1..=4.
    // (message fixed: it previously claimed "less than 3 but more than 0")
    if !(1..5).contains(&ns) {
        return Err(DecodeErrors::SosError(format!(
            "Number of components in start of scan should be between 1 and 4. Found {ns}"
        )));
    }
    if image.info.components == 0 {
        return Err(DecodeErrors::FormatStatic(
            "Error decoding SOF Marker, Number of components cannot be zero."
        ));
    }
    // consume spec parameters
    for i in 0..ns {
        // CS_i parameter, the component id for this scan entry
        let id = image.stream.read_u8_err()?;
        if seen.contains(&i32::from(id)) {
            return Err(DecodeErrors::SofError(format!(
                "Duplicate ID {id} seen twice in the same component"
            )));
        }
        seen[usize::from(i)] = i32::from(id);
        // DC and AC huffman table position
        // top 4 bits contain dc huffman destination table
        // lower four bits contain ac huffman destination table
        let y = image.stream.read_u8_err()?;
        // find the frame component with a matching id
        let mut j = 0;
        while j < image.info.components {
            if image.components[j as usize].id == id {
                break;
            }
            j += 1;
        }
        if j == image.info.components {
            return Err(DecodeErrors::SofError(format!(
                "Invalid component id {}, expected a value between 0 and {}",
                id,
                image.components.len()
            )));
        }
        image.components[usize::from(j)].dc_huff_table = usize::from((y >> 4) & 0xF);
        image.components[usize::from(j)].ac_huff_table = usize::from(y & 0xF);
        image.z_order[i as usize] = j as usize;
    }
    // Collect the component spec parameters
    // This is only needed for progressive images but I'll read
    // them in order to ensure they are correct according to the spec
    // Extract progressive information
    // https://www.w3.org/Graphics/JPEG/itu-t81.pdf
    // Page 42
    // Start of spectral / predictor selection. (between 0 and 63)
    image.spec_start = image.stream.read_u8_err()?;
    // End of spectral selection
    image.spec_end = image.stream.read_u8_err()?;
    let bit_approx = image.stream.read_u8_err()?;
    // successive approximation bit position high
    image.succ_high = bit_approx >> 4;
    if image.spec_end > 63 {
        return Err(DecodeErrors::SosError(format!(
            "Invalid Se parameter {}, range should be 0-63",
            image.spec_end
        )));
    }
    if image.spec_start > 63 {
        return Err(DecodeErrors::SosError(format!(
            "Invalid Ss parameter {}, range should be 0-63",
            image.spec_start
        )));
    }
    if image.succ_high > 13 {
        // BUGFIX: this previously printed `image.succ_low`, which is not
        // even assigned until after this check; report the offending Ah.
        return Err(DecodeErrors::SosError(format!(
            "Invalid Ah parameter {}, range should be 0-13",
            image.succ_high
        )));
    }
    // successive approximation bit position low
    image.succ_low = bit_approx & 0xF;
    if image.succ_low > 13 {
        return Err(DecodeErrors::SosError(format!(
            "Invalid Al parameter {}, range should be 0-13",
            image.succ_low
        )));
    }
    trace!(
        "Ss={}, Se={} Ah={} Al={}",
        image.spec_start,
        image.spec_end,
        image.succ_high,
        image.succ_low
    );
    Ok(())
}
/// Parse Adobe App14 segment
///
/// The Adobe APP14 segment carries a color-transform byte which may change
/// the decoder's input colorspace (CMYK / YCbCr / YCCK).
pub(crate) fn parse_app14<T: ZByteReaderTrait>(
    decoder: &mut JpegDecoder<T>
) -> Result<(), DecodeErrors> {
    // skip length
    let mut length = usize::from(decoder.stream.get_u16_be());
    if length < 2 {
        return Err(DecodeErrors::FormatStatic("Too small APP14 length"));
    }
    // NOTE(review): this check subsumes the `< 2` check above
    if length < 14 {
        return Err(DecodeErrors::FormatStatic(
            "Too short of a length for App14 segment"
        ));
    }
    // peek does not advance the stream; the skips below consume the bytes
    if decoder.stream.peek_at(0, 5)? == b"Adobe" {
        // move stream 6 bytes to remove adobe id
        decoder.stream.skip(6)?;
        // skip version, flags0 and flags1
        decoder.stream.skip(5)?;
        // get color transform
        let transform = decoder.stream.read_u8();
        // https://exiftool.org/TagNames/JPEG.html#Adobe
        match transform {
            0 => decoder.input_colorspace = ColorSpace::CMYK,
            1 => decoder.input_colorspace = ColorSpace::YCbCr,
            2 => decoder.input_colorspace = ColorSpace::YCCK,
            _ => {
                return Err(DecodeErrors::Format(format!(
                    "Unknown Adobe colorspace {transform}"
                )))
            }
        }
        // length = 2
        // adobe id = 6
        // version = 5
        // transform = 1
        length = length.saturating_sub(14);
    } else if decoder.options.strict_mode() {
        return Err(DecodeErrors::FormatStatic("Corrupt Adobe App14 segment"));
    } else {
        length = length.saturating_sub(2);
        error!("Not a valid Adobe APP14 Segment");
    }
    // skip any proceeding lengths.
    // we do not need them
    decoder.stream.skip(length)?;
    Ok(())
}
/// Parse the APP1 segment
///
/// This contains the exif tag; the raw bytes (starting at the TIFF header)
/// are stashed in `decoder.exif_data` for later retrieval via `exif()`.
pub(crate) fn parse_app1<T: ZByteReaderTrait>(
    decoder: &mut JpegDecoder<T>
) -> Result<(), DecodeErrors> {
    // contains exif data
    let mut length = usize::from(decoder.stream.get_u16_be());
    if length < 2 {
        return Err(DecodeErrors::FormatStatic("Too small app1 length"));
    }
    // length bytes
    length -= 2;
    // the segment must begin with the 6-byte "Exif\0\0" identifier
    if length > 6 && decoder.stream.peek_at(0, 6)? == b"Exif\x00\x00" {
        trace!("Exif segment present");
        // skip bytes we read above
        decoder.stream.skip(6)?;
        length -= 6;
        // peek does not advance the stream; the payload is consumed by
        // the unconditional skip below
        let exif_bytes = decoder.stream.peek_at(0, length)?.to_vec();
        decoder.exif_data = Some(exif_bytes);
    } else {
        warn!("Wrongly formatted exif tag");
    }
    decoder.stream.skip(length)?;
    Ok(())
}
/// Parse the APP2 segment
///
/// Collects ICC profile chunks (identified by "ICC_PROFILE\0") into
/// `decoder.icc_data`; they are reassembled later by `icc_profile()`.
pub(crate) fn parse_app2<T: ZByteReaderTrait>(
    decoder: &mut JpegDecoder<T>
) -> Result<(), DecodeErrors> {
    let mut length = usize::from(decoder.stream.get_u16_be());
    if length < 2 {
        return Err(DecodeErrors::FormatStatic("Too small app2 segment"));
    }
    // length bytes
    length -= 2;
    if length > 14 && decoder.stream.peek_at(0, 12)? == *b"ICC_PROFILE\0" {
        trace!("ICC Profile present");
        // skip 12 bytes which indicate ICC profile
        length -= 12;
        decoder.stream.skip(12)?;
        // chunk sequence number (1-based) and total chunk count
        let seq_no = decoder.stream.read_u8();
        let num_markers = decoder.stream.read_u8();
        // deduct the two bytes we read above
        length -= 2;
        // peek does not advance; the skip below consumes the payload
        let data = decoder.stream.peek_at(0, length)?.to_vec();
        let icc_chunk = ICCChunk {
            seq_no,
            num_markers,
            data
        };
        decoder.icc_data.push(icc_chunk);
    }
    decoder.stream.skip(length)?;
    Ok(())
}
/// Re-order a zig-zagged quantization table into natural (raster) order,
/// widening every entry to `i32` in the process.
fn un_zig_zag<T>(a: &[T]) -> [i32; 64]
where
    T: Default + Copy,
    i32: core::convert::From<T>
{
    let mut natural_order = [0_i32; 64];
    // UN_ZIGZAG[i] gives the raster position of zig-zag index i
    for (zig_pos, &raster_pos) in UN_ZIGZAG.iter().take(64).enumerate() {
        natural_order[raster_pos] = i32::from(a[zig_pos]);
    }
    natural_order
}

254
third_party/zune-jpeg/src/huffman.rs vendored

@ -0,0 +1,254 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! This file contains a single struct `HuffmanTable` that
//! stores Huffman tables needed during `BitStream` decoding.
#![allow(clippy::similar_names, clippy::module_name_repetitions)]
use alloc::string::ToString;
use crate::errors::DecodeErrors;
/// Determines how many bits of lookahead we have for our bitstream decoder.
///
/// The `lookup` tables in `HuffmanTable` hold `1 << HUFF_LOOKAHEAD` entries.
pub const HUFF_LOOKAHEAD: u8 = 9;
/// A struct which contains necessary tables for decoding a JPEG
/// huffman encoded bitstream
pub struct HuffmanTable {
    // element `[0]` of each array is unused
    /// largest code of length k
    pub(crate) maxcode: [i32; 18],
    /// offset for codes of length k
    /// Answers the question, where do code-lengths of length k end
    /// Element 0 is unused
    pub(crate) offset: [i32; 18],
    /// lookup table for fast decoding
    ///
    /// top bits above HUFF_LOOKAHEAD contain the code length.
    ///
    /// Lower (8) bits contain the symbol in order of increasing code length.
    pub(crate) lookup: [i32; 1 << HUFF_LOOKAHEAD],
    /// A table which can be used to decode small AC coefficients and
    /// do an equivalent of receive_extend
    pub(crate) ac_lookup: Option<[i16; 1 << HUFF_LOOKAHEAD]>,
    /// Symbols from the JPEG DHT marker, in order of increasing code length
    // NOTE(review): the original doc here mixed in lines describing a `bits`
    // array ("number of symbols with codes of length k bits") that this
    // struct does not store; the counts are only passed to `new`.
    pub(crate) values: [u8; 256]
}
impl HuffmanTable {
    /// Build a Huffman table together with its derived decode tables.
    ///
    /// * `codes`  - count of symbols for each code length (index 0 unused)
    /// * `values` - symbols in order of increasing code length
    /// * `is_dc`  - DC tables get range validation of symbols; AC tables
    ///   additionally get a fast AC lookup table
    /// * `is_progressive` - currently unused by the derivation
    ///
    /// # Errors
    /// Returns `DecodeErrors::HuffmanDecode` if the counts do not describe a
    /// legal Huffman tree or a DC symbol is out of range.
    pub fn new(
        codes: &[u8; 17], values: [u8; 256], is_dc: bool, is_progressive: bool
    ) -> Result<HuffmanTable, DecodeErrors> {
        // Sentinel: a "length" field above HUFF_LOOKAHEAD marks lookup
        // entries for codes longer than the lookahead window, forcing the
        // slow decode path.
        let too_long_code = (i32::from(HUFF_LOOKAHEAD) + 1) << HUFF_LOOKAHEAD;
        let mut p = HuffmanTable {
            maxcode: [0; 18],
            offset: [0; 18],
            lookup: [too_long_code; 1 << HUFF_LOOKAHEAD],
            values,
            ac_lookup: None
        };
        p.make_derived_table(is_dc, is_progressive, codes)?;
        Ok(p)
    }
    /// Create a new huffman tables with values that aren't fixed
    /// used by fill_mjpeg_tables
    pub fn new_unfilled(
        codes: &[u8; 17], values: &[u8], is_dc: bool, is_progressive: bool
    ) -> Result<HuffmanTable, DecodeErrors> {
        // Pad the (possibly shorter) value slice up to the fixed 256-entry
        // array expected by `new`.
        let mut buf = [0; 256];
        buf[..values.len()].copy_from_slice(values);
        HuffmanTable::new(codes, buf, is_dc, is_progressive)
    }
    /// Compute derived values for a Huffman table
    ///
    /// This routine performs some validation checks on the table
    #[allow(
        clippy::cast_possible_truncation,
        clippy::cast_possible_wrap,
        clippy::cast_sign_loss,
        clippy::too_many_lines,
        clippy::needless_range_loop
    )]
    fn make_derived_table(
        &mut self, is_dc: bool, _is_progressive: bool, bits: &[u8; 17]
    ) -> Result<(), DecodeErrors> {
        // build a list of code size
        let mut huff_size = [0; 257];
        // Huffman code lengths
        let mut huff_code: [u32; 257] = [0; 257];
        // figure C.1 make table of Huffman code length for each symbol
        let mut p = 0;
        for l in 1..=16 {
            let mut i = i32::from(bits[l]);
            // table overrun is checked before, so we don't need to check here
            while i != 0 {
                huff_size[p] = l as u8;
                p += 1;
                i -= 1;
            }
        }
        // zero terminator so the loops below know where the symbols end
        huff_size[p] = 0;
        let num_symbols = p;
        // Generate the codes themselves
        // We also validate that the counts represent a legal Huffman code tree
        let mut code = 0;
        let mut si = i32::from(huff_size[0]);
        p = 0;
        while huff_size[p] != 0 {
            // assign consecutive codes to all symbols of the current length
            while i32::from(huff_size[p]) == si {
                huff_code[p] = code;
                code += 1;
                p += 1;
            }
            // maximum code of length si, pre-shifted by 16-k bits
            self.maxcode[si as usize] = (code << (16 - si)) as i32;
            // code is now 1 more than the last code used for code-length si; but
            // it must still fit in si bits, since no code is allowed to be all ones.
            if (code as i32) >= (1 << si) {
                return Err(DecodeErrors::HuffmanDecode("Bad Huffman Table".to_string()));
            }
            code <<= 1;
            si += 1;
        }
        // Figure F.15 generate decoding tables for bit-sequential decoding
        p = 0;
        for l in 0..=16 {
            if bits[l] == 0 {
                // -1 if no codes of this length
                self.maxcode[l] = -1;
            } else {
                // offset[l]=codes[index of 1st symbol of code length l
                // minus minimum code of length l]
                self.offset[l] = (p as i32) - (huff_code[p]) as i32;
                p += usize::from(bits[l]);
            }
        }
        self.offset[17] = 0;
        // we ensure that decode terminates
        self.maxcode[17] = 0x000F_FFFF;
        /*
         * Compute lookahead tables to speed up decoding.
         * First we set all the table entries to 0(left justified), indicating "too long";
         * (Note too long was set during initialization)
         * then we iterate through the Huffman codes that are short enough and
         * fill in all the entries that correspond to bit sequences starting
         * with that code.
         */
        p = 0;
        for l in 1..=HUFF_LOOKAHEAD {
            for _ in 1..=i32::from(bits[usize::from(l)]) {
                // l -> Current code length,
                // p => Its index in self.code and self.values
                // Generate left justified code followed by all possible bit sequences
                let mut look_bits = (huff_code[p] as usize) << (HUFF_LOOKAHEAD - l);
                for _ in 0..1 << (HUFF_LOOKAHEAD - l) {
                    // entry packs (code length << HUFF_LOOKAHEAD) | symbol
                    self.lookup[look_bits] =
                        (i32::from(l) << HUFF_LOOKAHEAD) | i32::from(self.values[p]);
                    look_bits += 1;
                }
                p += 1;
            }
        }
        // build an ac table that does an equivalent of decode and receive_extend
        if !is_dc {
            // 255 marks "no short code maps to this bit pattern"
            let mut fast = [255; 1 << HUFF_LOOKAHEAD];
            // Iterate over number of symbols
            for i in 0..num_symbols {
                // get code size for an item
                let s = huff_size[i];
                if s <= HUFF_LOOKAHEAD {
                    // if it's lower than what we need for our lookup table create the table
                    let c = (huff_code[i] << (HUFF_LOOKAHEAD - s)) as usize;
                    let m = (1 << (HUFF_LOOKAHEAD - s)) as usize;
                    for j in 0..m {
                        fast[c + j] = i as i16;
                    }
                }
            }
            // build a table that decodes both magnitude and value of small ACs in
            // one go.
            let mut fast_ac = [0; 1 << HUFF_LOOKAHEAD];
            for i in 0..(1 << HUFF_LOOKAHEAD) {
                let fast_v = fast[i];
                if fast_v < 255 {
                    // get symbol value from AC table
                    let rs = self.values[fast_v as usize];
                    // shift by 4 to get run length
                    let run = i16::from((rs >> 4) & 15);
                    // get magnitude bits stored at the lower 4 bits
                    let mag_bits = i16::from(rs & 15);
                    // length of the bit we've read
                    let len = i16::from(huff_size[fast_v as usize]);
                    if mag_bits != 0 && (len + mag_bits) <= i16::from(HUFF_LOOKAHEAD) {
                        // magnitude code followed by receive_extend code
                        let mut k = (((i as i16) << len) & ((1 << HUFF_LOOKAHEAD) - 1))
                            >> (i16::from(HUFF_LOOKAHEAD) - mag_bits);
                        let m = 1 << (mag_bits - 1);
                        // receive_extend: sign-extend the magnitude value
                        if k < m {
                            k += (!0_i16 << mag_bits) + 1;
                        };
                        // if result is small enough fit into fast ac table
                        if (-128..=127).contains(&k) {
                            // entry packs (value << 8) | (run << 4) | bits consumed
                            fast_ac[i] = (k << 8) + (run << 4) + (len + mag_bits);
                        }
                    }
                }
            }
            self.ac_lookup = Some(fast_ac);
        }
        // Validate symbols as being reasonable
        // For AC tables, we make no check, but accept all byte values 0..255
        // For DC tables, we require symbols to be in range 0..15
        if is_dc {
            for i in 0..num_symbols {
                let sym = self.values[i];
                if sym > 15 {
                    return Err(DecodeErrors::HuffmanDecode("Bad Huffman Table".to_string()));
                }
            }
        }
        Ok(())
    }
}

147
third_party/zune-jpeg/src/idct.rs vendored

@ -0,0 +1,147 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! Routines for IDCT
//!
//! Essentially we provide 2 routines for IDCT, a scalar implementation and a not super optimized
//! AVX2 one, i'll talk about them here.
//!
//! There are 2 reasons why we have the avx one
//! 1. No one compiles with -C target-features=avx2 hence binaries won't probably take advantage(even
//! if it exists).
//! 2. AVX employs zero short circuit in a way the scalar code cannot employ it.
//! - AVX does this by checking for MCU's whose 63 AC coefficients are zero and if true, it writes
//! values directly, if false, it goes the long way of calculating.
//! - Although this can be trivially implemented in the scalar version, it generates code
//! I'm not happy width(scalar version that basically loops and that is too many branches for me)
//! The avx one does a better job of using bitwise or's with (`_mm256_or_si256`) which is magnitudes of faster
//! than anything I could come up with
//!
//! The AVX code also has some cool transpose_u16 instructions which look so complicated to be cool
//! (spoiler alert, i barely understand how it works, that's why I credited the owner).
//!
#![allow(
clippy::excessive_precision,
clippy::unreadable_literal,
clippy::module_name_repetitions,
unused_parens,
clippy::wildcard_imports
)]
use zune_core::log::debug;
use zune_core::options::DecoderOptions;
use crate::decoder::IDCTPtr;
use crate::idct::scalar::idct_int;
#[cfg(feature = "x86")]
pub mod avx2;
#[cfg(feature = "neon")]
pub mod neon;
pub mod scalar;
/// Choose an appropriate IDCT function
///
/// Preference order: AVX2 (x86/x86_64 builds with the `x86` feature and
/// `options.use_avx2()`), then NEON (aarch64 builds with the `neon` feature
/// and `options.use_neon()`), falling back to the portable scalar IDCT.
#[allow(unused_variables)]
pub fn choose_idct_func(options: &DecoderOptions) -> IDCTPtr {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[cfg(feature = "x86")]
    {
        // runtime opt-out even when the feature is compiled in
        if options.use_avx2() {
            debug!("Using vector integer IDCT");
            // use avx one
            return crate::idct::avx2::idct_avx2;
        }
    }
    #[cfg(target_arch = "aarch64")]
    #[cfg(feature = "neon")]
    {
        if options.use_neon() {
            debug!("Using vector integer IDCT");
            return crate::idct::neon::idct_neon;
        }
    }
    debug!("Using scalar integer IDCT");
    // use generic one
    return idct_int;
}
#[cfg(test)]
#[allow(unreachable_code)]
#[allow(dead_code)]
mod tests {
    use super::*;
    /// Shared driver: run the platform-specific IDCT and the scalar
    /// reference on identical copies of `coeff` and assert the outputs
    /// match exactly. (Previously each test repeated this boilerplate.)
    fn assert_idct_matches_scalar(coeff: [i32; 64]) {
        let stride = 8;
        let mut vector_in = coeff;
        let mut scalar_in = coeff;
        let mut output_scalar = [0; 64];
        let mut output_vector = [0; 64];
        idct_fnc()(&mut vector_in, &mut output_vector, stride);
        idct_int(&mut scalar_in, &mut output_scalar, stride);
        assert_eq!(output_scalar, output_vector, "IDCT and scalar do not match");
    }
    /// Uniform block of small positive coefficients.
    #[test]
    fn idct_test0() {
        assert_idct_matches_scalar([10; 64]);
    }
    /// Uniform block, different magnitude.
    #[test]
    fn do_idct_test1() {
        assert_idct_matches_scalar([14; 64]);
    }
    /// Extreme DC plus highest-frequency AC coefficient.
    #[test]
    fn do_idct_test2() {
        let mut coeff = [0; 64];
        coeff[0] = 255;
        coeff[63] = -256;
        assert_idct_matches_scalar(coeff);
    }
    /// All-zero block exercises the zero short-circuit fast path.
    #[test]
    fn do_idct_zeros() {
        assert_idct_matches_scalar([0; 64]);
    }
    /// Pick the best vector IDCT compiled into this build, falling back to
    /// the scalar implementation when no SIMD path is available.
    fn idct_fnc() -> IDCTPtr {
        #[cfg(feature = "neon")]
        #[cfg(target_arch = "aarch64")]
        {
            use crate::idct::neon::idct_neon;
            return idct_neon;
        }
        #[cfg(feature = "x86")]
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            use crate::idct::avx2::idct_avx2;
            return idct_avx2;
        }
        idct_int
    }
}

288
third_party/zune-jpeg/src/idct/avx2.rs vendored

@ -0,0 +1,288 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
//! AVX optimised IDCT.
//!
//! Okay not thaat optimised.
//!
//!
//! # The implementation
//! The implementation is neatly broken down into two operations.
//!
//! 1. Test for zeroes
//! > There is a shortcut method for idct where when all AC values are zero, we can get the answer really quickly.
//! by scaling the 1/8th of the DCT coefficient of the block to the whole block and level shifting.
//!
//! 2. If above fails, we proceed to carry out IDCT as a two pass one dimensional algorithm.
//! IT does two whole scans where it carries out IDCT on all items
//! After each successive scan, data is transposed in register(thank you x86 SIMD powers). and the second
//! pass is carried out.
//!
//! The code is not super optimized, it produces bit identical results with scalar code hence it's
//! `mm256_add_epi16`
//! and it also has the advantage of making this implementation easy to maintain.
#![cfg(feature = "x86")]
#![allow(dead_code)]
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use crate::unsafe_utils::{transpose, YmmRegister};
// Rounding + level-shift bias added in the second (column) IDCT pass before
// the final `>> 17`: 65536 is 0.5 * (1 << 17) for rounding and `128 << 17`
// pre-shifts the +128 level shift (see the scalar IDCT comments). The extra
// 512 appears to carry the first-pass rounding term — kept as-is.
const SCALE_BITS: i32 = 512 + 65536 + (128 << 17);
/// SAFETY
/// ------
///
/// It is the responsibility of the CALLER to ensure that this function is
/// called in contexts where the CPU supports it
///
///
/// For documentation see module docs.
pub fn idct_avx2(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
    // SAFETY: per the contract above, the caller guarantees AVX2 support.
    unsafe {
        // We don't call this method directly because we need to flag the code function
        // with #[target_feature] so that the compiler does do weird stuff with
        // it
        idct_int_avx2_inner(in_vector, out_vector, stride);
    }
}
/// AVX2 implementation of the 8x8 integer IDCT.
///
/// Writes eight 8-sample output rows into `out_vector`, each row `stride`
/// elements after the previous, with values level shifted and clamped to
/// `0..=255`.
///
/// # Safety
/// The caller must ensure the CPU supports AVX2 (see [`idct_avx2`]).
#[target_feature(enable = "avx2")]
#[allow(
    clippy::too_many_lines,
    clippy::cast_possible_truncation,
    clippy::similar_names,
    clippy::op_ref,
    unused_assignments,
    clippy::zero_prefixed_literal
)]
pub unsafe fn idct_int_avx2_inner(
    in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize
) {
    let mut pos = 0;
    // load into registers
    //
    // We sign extend i16's to i32's and calculate them with extended precision and
    // later reduce them to i16's when we are done carrying out IDCT
    let rw0 = _mm256_loadu_si256(in_vector[00..].as_ptr().cast());
    let rw1 = _mm256_loadu_si256(in_vector[08..].as_ptr().cast());
    let rw2 = _mm256_loadu_si256(in_vector[16..].as_ptr().cast());
    let rw3 = _mm256_loadu_si256(in_vector[24..].as_ptr().cast());
    let rw4 = _mm256_loadu_si256(in_vector[32..].as_ptr().cast());
    let rw5 = _mm256_loadu_si256(in_vector[40..].as_ptr().cast());
    let rw6 = _mm256_loadu_si256(in_vector[48..].as_ptr().cast());
    let rw7 = _mm256_loadu_si256(in_vector[56..].as_ptr().cast());
    // Forward DCT and quantization may cause all the AC terms to be zero, for such
    // cases we can try to accelerate it
    // Basically the poop is that whenever the array has 63 zeroes, its idct is
    // (arr[0]>>3)or (arr[0]/8) propagated to all the elements.
    // We first test to see if the array contains zero elements and if it does, we go the
    // short way.
    //
    // This reduces IDCT overhead from about 39% to 18 %, almost half
    // Do another load for the first row, we don't want to check DC value, because
    // we only care about AC terms
    // (rw8 covers elements 1..=8, so together with rw1..rw7 all 63 AC terms
    // are tested; element 8 is tested twice, which is harmless)
    let rw8 = _mm256_loadu_si256(in_vector[1..].as_ptr().cast());
    let zero = _mm256_setzero_si256();
    let mut non_zero = 0;
    // Each movemask is -1 (all 32 mask bits set) only when every lane of the
    // compared register equals zero; eight all-zero registers sum to -8.
    //
    // NOTE(review): rows 3..7 use `_mm256_cmpeq_epi64` while the rest use
    // `_mm256_cmpeq_epi32`. For an equals-zero test both yield an all-ones
    // mask exactly when the register is fully zero, so the mixed widths look
    // equivalent here — confirm the mix was intentional.
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw8, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw1, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi32(rw2, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw3, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw4, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw5, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw6, zero));
    non_zero += _mm256_movemask_epi8(_mm256_cmpeq_epi64(rw7, zero));
    if non_zero == -8 {
        // AC terms all zero, idct of the block is is ( coeff[0] * qt[0] )/8 + 128 (bias)
        // (and clamped to 255)
        let idct_value = _mm_set1_epi16(((in_vector[0] >> 3) + 128).clamp(0, 255) as i16);
        macro_rules! store {
            ($pos:tt,$value:tt) => {
                // store
                _mm_storeu_si128(
                    out_vector
                        .get_mut($pos..$pos + 8)
                        .unwrap()
                        .as_mut_ptr()
                        .cast(),
                    $value
                );
                $pos += stride;
            };
        }
        // replicate the DC value across all eight output rows
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        return;
    }
    let mut row0 = YmmRegister { mm256: rw0 };
    let mut row1 = YmmRegister { mm256: rw1 };
    let mut row2 = YmmRegister { mm256: rw2 };
    let mut row3 = YmmRegister { mm256: rw3 };
    let mut row4 = YmmRegister { mm256: rw4 };
    let mut row5 = YmmRegister { mm256: rw5 };
    let mut row6 = YmmRegister { mm256: rw6 };
    let mut row7 = YmmRegister { mm256: rw7 };
    // One 1-D IDCT pass over all eight rows; constants match the scalar
    // implementation (see idct/scalar.rs for the derivation comments).
    macro_rules! dct_pass {
        ($SCALE_BITS:tt,$scale:tt) => {
            // There are a lot of ways to do this
            // but to keep it simple(and beautiful), ill make a direct translation of the
            // scalar code to also make this code fully transparent(this version and the non
            // avx one should produce identical code.)
            // even part
            let p1 = (row2 + row6) * 2217;
            let mut t2 = p1 + row6 * -7567;
            let mut t3 = p1 + row2 * 3135;
            let mut t0 = YmmRegister {
                mm256: _mm256_slli_epi32((row0 + row4).mm256, 12)
            };
            let mut t1 = YmmRegister {
                mm256: _mm256_slli_epi32((row0 - row4).mm256, 12)
            };
            let x0 = t0 + t3 + $SCALE_BITS;
            let x3 = t0 - t3 + $SCALE_BITS;
            let x1 = t1 + t2 + $SCALE_BITS;
            let x2 = t1 - t2 + $SCALE_BITS;
            let p3 = row7 + row3;
            let p4 = row5 + row1;
            let p1 = row7 + row1;
            let p2 = row5 + row3;
            let p5 = (p3 + p4) * 4816;
            t0 = row7 * 1223;
            t1 = row5 * 8410;
            t2 = row3 * 12586;
            t3 = row1 * 6149;
            let p1 = p5 + p1 * -3685;
            let p2 = p5 + (p2 * -10497);
            let p3 = p3 * -8034;
            let p4 = p4 * -1597;
            t3 += p1 + p4;
            t2 += p2 + p3;
            t1 += p2 + p4;
            t0 += p1 + p3;
            row0.mm256 = _mm256_srai_epi32((x0 + t3).mm256, $scale);
            row1.mm256 = _mm256_srai_epi32((x1 + t2).mm256, $scale);
            row2.mm256 = _mm256_srai_epi32((x2 + t1).mm256, $scale);
            row3.mm256 = _mm256_srai_epi32((x3 + t0).mm256, $scale);
            row4.mm256 = _mm256_srai_epi32((x3 - t0).mm256, $scale);
            row5.mm256 = _mm256_srai_epi32((x2 - t1).mm256, $scale);
            row6.mm256 = _mm256_srai_epi32((x1 - t2).mm256, $scale);
            row7.mm256 = _mm256_srai_epi32((x0 - t3).mm256, $scale);
        };
    }
    // Process rows
    dct_pass!(512, 10);
    transpose(
        &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7
    );
    // process columns
    dct_pass!(SCALE_BITS, 17);
    transpose(
        &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7
    );
    // Pack i32 to i16's,
    // clamp them to be between 0-255
    // Undo shuffling
    // Store back to array
    macro_rules! permute_store {
        ($x:tt,$y:tt,$index:tt,$out:tt) => {
            let a = _mm256_packs_epi32($x, $y);
            // Clamp the values after packing, we can clamp more values at once
            let b = clamp_avx(a);
            // /Undo shuffling
            let c = _mm256_permute4x64_epi64(b, shuffle(3, 1, 2, 0));
            // store first vector
            _mm_storeu_si128(
                ($out)
                    .get_mut($index..$index + 8)
                    .unwrap()
                    .as_mut_ptr()
                    .cast(),
                _mm256_extractf128_si256::<0>(c)
            );
            $index += stride;
            // second vector
            _mm_storeu_si128(
                ($out)
                    .get_mut($index..$index + 8)
                    .unwrap()
                    .as_mut_ptr()
                    .cast(),
                _mm256_extractf128_si256::<1>(c)
            );
            $index += stride;
        };
    }
    // Pack and write the values back to the array
    permute_store!((row0.mm256), (row1.mm256), pos, out_vector);
    permute_store!((row2.mm256), (row3.mm256), pos, out_vector);
    permute_store!((row4.mm256), (row5.mm256), pos, out_vector);
    permute_store!((row6.mm256), (row7.mm256), pos, out_vector);
}
#[inline]
#[target_feature(enable = "avx2")]
/// Clamp every 16-bit lane of `reg` into the pixel range `[0, 255]`,
/// i.e. `min(max(lane, 0), 255)`.
unsafe fn clamp_avx(reg: __m256i) -> __m256i {
    let floor = _mm256_set1_epi16(0);
    let ceiling = _mm256_set1_epi16(255);
    _mm256_min_epi16(_mm256_max_epi16(reg, floor), ceiling)
}
/// A copy of `_MM_SHUFFLE()` that doesn't require
/// a nightly compiler
///
/// Packs four 2-bit lane selectors into one immediate byte:
/// bits `[7:6]=z`, `[5:4]=y`, `[3:2]=x`, `[1:0]=w`.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    let mut control = w;
    control |= x << 2;
    control |= y << 4;
    control |= z << 6;
    control
}

296
third_party/zune-jpeg/src/idct/neon.rs vendored

@ -0,0 +1,296 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
#![cfg(target_arch = "aarch64")]
//! NEON optimised IDCT.
//!
//! Okay not thaat optimised.
//!
//!
//! # The implementation
//! The implementation is neatly broken down into two operations.
//!
//! 1. Test for zeroes
//! > There is a shortcut method for idct where when all AC values are zero, we can get the answer really quickly.
//! by scaling the 1/8th of the DCT coefficient of the block to the whole block and level shifting.
//!
//! 2. If above fails, we proceed to carry out IDCT as a two pass one dimensional algorithm.
//! IT does two whole scans where it carries out IDCT on all items
//! After each successive scan, data is transposed in register(thank you x86 SIMD powers). and the second
//! pass is carried out.
//!
//! The code is not super optimized, it produces bit identical results with scalar code hence it's
//! `mm256_add_epi16`
//! and it also has the advantage of making this implementation easy to maintain.
#![cfg(feature = "neon")]
use core::arch::aarch64::*;
use crate::unsafe_utils::{transpose, YmmRegister};
const SCALE_BITS: i32 = 512 + 65536 + (128 << 17);
/// SAFETY
/// ------
///
/// It is the responsibility of the CALLER to ensure that this function is
/// called in contexts where the CPU supports it
///
///
/// For documentation see module docs.
pub fn idct_neon(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
    // SAFETY: per the contract above, the caller guarantees NEON support.
    unsafe {
        // We don't call this method directly because we need to flag the code function
        // with #[target_feature] so that the compiler does do weird stuff with
        // it
        idct_int_neon_inner(in_vector, out_vector, stride);
    }
}
#[inline]
#[target_feature(enable = "neon")]
/// Saturating-narrow a pair of `int32x4` vectors into a single `int16x8`
/// (`vqmovn_s32` saturates lanes that do not fit in i16).
unsafe fn pack_16(a: int32x4x2_t) -> int16x8_t {
    vcombine_s16(vqmovn_s32(a.0), vqmovn_s32(a.1))
}
#[inline]
#[target_feature(enable = "neon")]
/// Narrow two `int32x4x2_t` register pairs into a pair of `int16x8`
/// vectors via [`pack_16`].
unsafe fn condense_bottom_16(a: int32x4x2_t, b: int32x4x2_t) -> int16x8x2_t {
    int16x8x2_t(pack_16(a), pack_16(b))
}
/// NEON implementation of the 8x8 integer IDCT.
///
/// Writes eight 8-sample output rows into `out_vector`, each row `stride`
/// elements after the previous, with values level shifted and clamped to
/// `0..=255`.
///
/// # Safety
/// The caller must ensure the CPU supports NEON (see [`idct_neon`]).
#[target_feature(enable = "neon")]
#[allow(
    clippy::too_many_lines,
    clippy::cast_possible_truncation,
    clippy::similar_names,
    clippy::op_ref,
    unused_assignments,
    clippy::zero_prefixed_literal
)]
pub unsafe fn idct_int_neon_inner(
    in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize
) {
    let mut pos = 0;
    // load into registers
    //
    // We sign extend i16's to i32's and calculate them with extended precision and
    // later reduce them to i16's when we are done carrying out IDCT
    // (YmmRegister here is the project wrapper over the NEON register pair)
    let mut row0 = YmmRegister::load(in_vector[00..].as_ptr().cast());
    let mut row1 = YmmRegister::load(in_vector[08..].as_ptr().cast());
    let mut row2 = YmmRegister::load(in_vector[16..].as_ptr().cast());
    let mut row3 = YmmRegister::load(in_vector[24..].as_ptr().cast());
    let mut row4 = YmmRegister::load(in_vector[32..].as_ptr().cast());
    let mut row5 = YmmRegister::load(in_vector[40..].as_ptr().cast());
    let mut row6 = YmmRegister::load(in_vector[48..].as_ptr().cast());
    let mut row7 = YmmRegister::load(in_vector[56..].as_ptr().cast());
    // Forward DCT and quantization may cause all the AC terms to be zero, for such
    // cases we can try to accelerate it
    // Basically the poop is that whenever the array has 63 zeroes, its idct is
    // (arr[0]>>3)or (arr[0]/8) propagated to all the elements.
    // We first test to see if the array contains zero elements and if it does, we go the
    // short way.
    //
    // This reduces IDCT overhead from about 39% to 18 %, almost half
    // Do another load for the first row, we don't want to check DC value, because
    // we only care about AC terms
    // (row8 covers elements 1..=8, so together with row1..row7 every AC term
    // is included in the OR tree below)
    // TODO this should be a shift/shuffle, not a likely unaligned load
    let row8 = YmmRegister::load(in_vector[1..].as_ptr().cast());
    // OR all AC-carrying registers together: the result is zero iff every
    // AC coefficient is zero.
    let or_tree = (((row1 | row8) | (row2 | row3)) | ((row4 | row5) | (row6 | row7)));
    if or_tree.all_zero() {
        // AC terms all zero, idct of the block is is ( coeff[0] * qt[0] )/8 + 128 (bias)
        // (and clamped to 255)
        let clamped_16 = ((in_vector[0] >> 3) + 128).clamp(0, 255) as i16;
        let idct_value = vdupq_n_s16(clamped_16);
        macro_rules! store {
            ($pos:tt,$value:tt) => {
                // store
                vst1q_s16(
                    out_vector
                        .get_mut($pos..$pos + 8)
                        .unwrap()
                        .as_mut_ptr()
                        .cast(),
                    $value
                );
                $pos += stride;
            };
        }
        // replicate the DC value across all eight output rows
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        store!(pos, idct_value);
        return;
    }
    // One 1-D IDCT pass over all eight rows; constants match the scalar
    // implementation (see idct/scalar.rs for the derivation comments).
    macro_rules! dct_pass {
        ($SCALE_BITS:tt,$scale:tt) => {
            // There are a lot of ways to do this
            // but to keep it simple(and beautiful), ill make a direct translation of the
            // scalar code to also make this code fully transparent(this version and the non
            // avx one should produce identical code.)
            // Compiler does a pretty good job of optimizing add + mul pairs
            // into multiply-acumulate pairs
            // even part
            let p1 = (row2 + row6) * 2217;
            let mut t2 = p1 + row6 * -7567;
            let mut t3 = p1 + row2 * 3135;
            let mut t0 = (row0 + row4).const_shl::<12>();
            let mut t1 = (row0 - row4).const_shl::<12>();
            let x0 = t0 + t3 + $SCALE_BITS;
            let x3 = t0 - t3 + $SCALE_BITS;
            let x1 = t1 + t2 + $SCALE_BITS;
            let x2 = t1 - t2 + $SCALE_BITS;
            let p3 = row7 + row3;
            let p4 = row5 + row1;
            let p1 = row7 + row1;
            let p2 = row5 + row3;
            let p5 = (p3 + p4) * 4816;
            t0 = row7 * 1223;
            t1 = row5 * 8410;
            t2 = row3 * 12586;
            t3 = row1 * 6149;
            let p1 = p5 + p1 * -3685;
            let p2 = p5 + (p2 * -10497);
            let p3 = p3 * -8034;
            let p4 = p4 * -1597;
            t3 += p1 + p4;
            t2 += p2 + p3;
            t1 += p2 + p4;
            t0 += p1 + p3;
            row0 = (x0 + t3).const_shra::<$scale>();
            row1 = (x1 + t2).const_shra::<$scale>();
            row2 = (x2 + t1).const_shra::<$scale>();
            row3 = (x3 + t0).const_shra::<$scale>();
            row4 = (x3 - t0).const_shra::<$scale>();
            row5 = (x2 - t1).const_shra::<$scale>();
            row6 = (x1 - t2).const_shra::<$scale>();
            row7 = (x0 - t3).const_shra::<$scale>();
        };
    }
    // Process rows
    dct_pass!(512, 10);
    transpose(
        &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7
    );
    // process columns
    dct_pass!(SCALE_BITS, 17);
    transpose(
        &mut row0, &mut row1, &mut row2, &mut row3, &mut row4, &mut row5, &mut row6, &mut row7
    );
    // Pack i32 to i16's,
    // clamp them to be between 0-255
    // Undo shuffling
    // Store back to array
    // This could potentially be reorganized to take advantage of the multi-register stores
    macro_rules! permute_store {
        ($x:tt,$y:tt,$index:tt,$out:tt) => {
            let a = condense_bottom_16($x, $y);
            // Clamp the values after packing, we can clamp more values at once
            let b = clamp256_neon(a);
            // store first vector
            vst1q_s16(
                ($out)
                    .get_mut($index..$index + 8)
                    .unwrap()
                    .as_mut_ptr()
                    .cast(),
                b.0
            );
            $index += stride;
            // second vector
            vst1q_s16(
                ($out)
                    .get_mut($index..$index + 8)
                    .unwrap()
                    .as_mut_ptr()
                    .cast(),
                b.1
            );
            $index += stride;
        };
    }
    // Pack and write the values back to the array
    // (`.mm256` is the raw register field of the project YmmRegister wrapper)
    permute_store!((row0.mm256), (row1.mm256), pos, out_vector);
    permute_store!((row2.mm256), (row3.mm256), pos, out_vector);
    permute_store!((row4.mm256), (row5.mm256), pos, out_vector);
    permute_store!((row6.mm256), (row7.mm256), pos, out_vector);
}
#[inline]
#[target_feature(enable = "neon")]
/// Clamp every 16-bit lane of `reg` into the pixel range `[0, 255]`,
/// i.e. `min(max(lane, 0), 255)`.
unsafe fn clamp_neon(reg: int16x8_t) -> int16x8_t {
    vminq_s16(vmaxq_s16(reg, vdupq_n_s16(0)), vdupq_n_s16(255))
}
#[inline]
#[target_feature(enable = "neon")]
/// Clamp both halves of a register pair to `[0, 255]` independently.
unsafe fn clamp256_neon(reg: int16x8x2_t) -> int16x8x2_t {
    let low = clamp_neon(reg.0);
    let high = clamp_neon(reg.1);
    int16x8x2_t(low, high)
}
#[cfg(test)]
mod test {
    use super::*;
    /// Verify that `clamp256_neon` clamps each of the sixteen i16 lanes to
    /// the pixel range 0..=255 (negatives to 0, >255 to 255, in-range kept).
    #[test]
    fn test_neon_clamp_256() {
        unsafe {
            let vals: [i16; 16] = [-1, -2, -3, 4, 256, 257, 258, 240, -1, 290, 2, 3, 4, 5, 6, 7];
            let loaded = vld1q_s16_x2(vals.as_ptr().cast());
            let shuffled = clamp256_neon(loaded);
            let mut result: [i16; 16] = [0; 16];
            vst1q_s16_x2(result.as_mut_ptr().cast(), shuffled);
            assert_eq!(
                result,
                [0, 0, 0, 4, 255, 255, 255, 240, 0, 255, 2, 3, 4, 5, 6, 7]
            )
        }
    }
}

212
third_party/zune-jpeg/src/idct/scalar.rs vendored

@ -0,0 +1,212 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! Platform independent IDCT algorithm
//!
//! Not as fast as AVX one.
/// Rounding and level-shift bias for the second IDCT pass, added before the
/// final `>> 17`: `65536` is `0.5 * (1 << 17)` for rounding and `128 << 17`
/// pre-shifts the +128 level shift (see the long comment inside `idct_int`);
/// the extra `512` appears to carry the first-pass rounding term — kept as-is.
const SCALE_BITS: i32 = 512 + 65536 + (128 << 17);
/// Perform a platform-independent integer IDCT on one 8x8 coefficient block.
///
/// * `in_vector`  - 64 dequantized DCT coefficients; used as scratch space
///   by the row pass, so its contents are clobbered.
/// * `out_vector` - receives eight 8-pixel output rows, each row starting
///   `stride` elements after the previous one; values are level shifted and
///   clamped to `0..=255`.
/// * `stride`     - element distance between consecutive output rows; the
///   slice must hold at least `7 * stride + 8` elements.
#[allow(unused_assignments)]
#[allow(
    clippy::too_many_lines,
    clippy::op_ref,
    clippy::cast_possible_truncation
)]
pub fn idct_int(in_vector: &mut [i32; 64], out_vector: &mut [i16], stride: usize) {
    // Temporary variables.
    let mut pos = 0;
    let mut i = 0;
    // Don't check for zeroes inside loop, lift it and check outside
    // we want to accelerate the case with 63 0 ac coeff
    if &in_vector[1..] == &[0_i32; 63] {
        // All AC terms are zero: the whole block is the DC term divided by 8,
        // level shifted by +128 and clamped to 0..=255.
        //
        // FIX: clamp in i32 *before* narrowing to i16 (via the `clamp`
        // helper). The previous `(x as i16).clamp(0, 255)` truncated first,
        // so a large DC value wrapped on the cast and could clamp to the
        // wrong bound; the AVX2/NEON fast paths clamp before casting.
        let coeff = [clamp((in_vector[0] >> 3) + 128); 8];
        macro_rules! store {
            ($index:tt) => {
                // position of the MCU
                let mcu_stride: &mut [i16; 8] = out_vector
                    .get_mut($index..$index + 8)
                    .unwrap()
                    .try_into()
                    .unwrap();
                // copy coefficients
                mcu_stride.copy_from_slice(&coeff);
                // increment index
                $index += stride;
            };
        }
        // write the replicated DC row to all eight output rows
        store!(pos);
        store!(pos);
        store!(pos);
        store!(pos);
        store!(pos);
        store!(pos);
        store!(pos);
        store!(pos);
    } else {
        // First pass: 1-D IDCT down each of the eight columns, in place.
        // because the compiler fails to see that it can be auto_vectorised so i'll
        // leave it here check out [idct_int_slow, and idct_int_1D to get what i mean ] https://godbolt.org/z/8hqW9z9j9
        for ptr in 0..8 {
            // even part
            let p2 = in_vector[ptr + 16];
            let p3 = in_vector[ptr + 48];
            let p1 = (p2 + p3).wrapping_mul(2217);
            let t2 = p1 + p3 * -7567;
            let t3 = p1 + p2 * 3135;
            let p2 = in_vector[ptr];
            let p3 = in_vector[32 + ptr];
            let t0 = fsh(p2 + p3);
            let t1 = fsh(p2 - p3);
            // 512 = 0.5 * (1 << 10) rounds the `>> 10` at the end of this pass
            let x0 = t0 + t3 + 512;
            let x3 = t0 - t3 + 512;
            let x1 = t1 + t2 + 512;
            let x2 = t1 - t2 + 512;
            // odd part
            let mut t0 = in_vector[ptr + 56];
            let mut t1 = in_vector[ptr + 40];
            let mut t2 = in_vector[ptr + 24];
            let mut t3 = in_vector[ptr + 8];
            let p3 = t0 + t2;
            let p4 = t1 + t3;
            let p1 = t0 + t3;
            let p2 = t1 + t2;
            let p5 = (p3 + p4) * 4816;
            t0 *= 1223;
            t1 *= 8410;
            t2 *= 12586;
            t3 *= 6149;
            let p1 = p5 + p1 * -3685;
            let p2 = p5 + p2 * -10497;
            let p3 = p3 * -8034;
            let p4 = p4 * -1597;
            t3 += p1 + p4;
            t2 += p2 + p3;
            t1 += p2 + p4;
            t0 += p1 + p3;
            // constants scaled things up by 1<<12; let's bring them back
            // down, but keep 2 extra bits of precision
            in_vector[ptr] = (x0 + t3) >> 10;
            in_vector[ptr + 8] = (x1 + t2) >> 10;
            in_vector[ptr + 16] = (x2 + t1) >> 10;
            in_vector[ptr + 24] = (x3 + t0) >> 10;
            in_vector[ptr + 32] = (x3 - t0) >> 10;
            in_vector[ptr + 40] = (x2 - t1) >> 10;
            in_vector[ptr + 48] = (x1 - t2) >> 10;
            in_vector[ptr + 56] = (x0 - t3) >> 10;
        }
        // Second pass: 1-D IDCT across each row, writing clamped pixels out.
        // This is vectorised in architectures supporting SSE 4.1
        while i < 64 {
            // We won't try to short circuit here because it rarely works
            // Even part
            let p2 = in_vector[i + 2];
            let p3 = in_vector[i + 6];
            let p1 = (p2 + p3) * 2217;
            let t2 = p1 + p3 * -7567;
            let t3 = p1 + p2 * 3135;
            let p2 = in_vector[i];
            let p3 = in_vector[i + 4];
            let t0 = fsh(p2 + p3);
            let t1 = fsh(p2 - p3);
            // constants scaled things up by 1<<12, plus we had 1<<2 from first
            // loop, plus horizontal and vertical each scale by sqrt(8) so together
            // we've got an extra 1<<3, so 1<<17 total we need to remove.
            // so we want to round that, which means adding 0.5 * 1<<17,
            // aka 65536. Also, we'll end up with -128 to 127 that we want
            // to encode as 0..255 by adding 128, so we'll add that before the shift
            let x0 = t0 + t3 + SCALE_BITS;
            let x3 = t0 - t3 + SCALE_BITS;
            let x1 = t1 + t2 + SCALE_BITS;
            let x2 = t1 - t2 + SCALE_BITS;
            // odd part
            let mut t0 = in_vector[i + 7];
            let mut t1 = in_vector[i + 5];
            let mut t2 = in_vector[i + 3];
            let mut t3 = in_vector[i + 1];
            let p3 = t0 + t2;
            let p4 = t1 + t3;
            let p1 = t0 + t3;
            let p2 = t1 + t2;
            let p5 = (p3 + p4) * f2f(1.175875602);
            t0 = t0.wrapping_mul(1223);
            t1 = t1.wrapping_mul(8410);
            t2 = t2.wrapping_mul(12586);
            t3 = t3.wrapping_mul(6149);
            let p1 = p5 + p1 * -3685;
            let p2 = p5 + p2 * -10497;
            let p3 = p3 * -8034;
            let p4 = p4 * -1597;
            t3 += p1 + p4;
            t2 += p2 + p3;
            t1 += p2 + p4;
            t0 += p1 + p3;
            let out: &mut [i16; 8] = out_vector
                .get_mut(pos..pos + 8)
                .unwrap()
                .try_into()
                .unwrap();
            out[0] = clamp((x0 + t3) >> 17);
            out[1] = clamp((x1 + t2) >> 17);
            out[2] = clamp((x2 + t1) >> 17);
            out[3] = clamp((x3 + t0) >> 17);
            out[4] = clamp((x3 - t0) >> 17);
            out[5] = clamp((x2 - t1) >> 17);
            out[6] = clamp((x1 - t2) >> 17);
            out[7] = clamp((x0 - t3) >> 17);
            i += 8;
            pos += stride;
        }
    }
}
#[inline]
#[allow(clippy::cast_possible_truncation)]
/// Convert a floating point constant to 12-bit fixed point
/// (multiply by 4096 and round to nearest).
fn f2f(x: f32) -> i32 {
    (x * 4096.0 + 0.5) as i32
}
#[inline]
/// Multiply a number by 4096 (shift into 12-bit fixed point).
fn fsh(x: i32) -> i32 {
    x << 12
}
/// Clamp values between 0 and 255, narrowing to `i16` only after clamping.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn clamp(a: i32) -> i16 {
    a.clamp(0, 255) as i16
}

133
third_party/zune-jpeg/src/lib.rs vendored

@ -0,0 +1,133 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//!This crate provides a library for decoding valid
//! ITU-T Rec. T.851 (09/2005) ITU-T T.81 (JPEG-1) or JPEG images.
//!
//!
//!
//! # Features
//! - SSE and AVX accelerated functions to speed up certain decoding operations
//! - FAST and accurate 32 bit IDCT algorithm
//! - Fast color convert functions
//! - RGBA and RGBX (4-Channel) color conversion functions
//! - YCbCr to Luma(Grayscale) conversion.
//!
//! # Usage
//! Add zune-jpeg to the dependencies in the project Cargo.toml
//!
//! ```toml
//! [dependencies]
//! zune_jpeg = "0.3"
//! ```
//! # Examples
//!
//! ## Decode a JPEG file with default arguments.
//!```no_run
//! use std::fs::read;
//! use std::io::BufReader;
//! use zune_jpeg::JpegDecoder;
//! let file_contents = BufReader::new(std::fs::File::open("a_jpeg.file").unwrap());
//! let mut decoder = JpegDecoder::new(file_contents);
//! let mut pixels = decoder.decode().unwrap();
//! ```
//!
//! ## Decode a JPEG file to RGBA format
//!
//! - Other (limited) supported formats are BGR and BGRA
//!
//!```no_run
//! use zune_core::bytestream::ZCursor;
//! use zune_core::colorspace::ColorSpace;
//! use zune_core::options::DecoderOptions;
//! use zune_jpeg::JpegDecoder;
//!
//! let mut options = DecoderOptions::default().jpeg_set_out_colorspace(ColorSpace::RGBA);
//!
//! let mut decoder = JpegDecoder::new_with_options(ZCursor::new(&[]),options);
//! let pixels = decoder.decode().unwrap();
//! ```
//!
//! ## Decode an image and get its width and height.
//!```no_run
//! use zune_core::bytestream::ZCursor;
//! use zune_jpeg::JpegDecoder;
//!
//! let mut decoder = JpegDecoder::new(ZCursor::new(&[]));
//! decoder.decode_headers().unwrap();
//! let image_info = decoder.info().unwrap();
//! println!("{},{}",image_info.width,image_info.height)
//! ```
//! # Crate features.
//! This crate tries to be as minimal as possible while being extensible
//! enough to handle the complexities arising from parsing different types
//! of jpeg images.
//!
//! Safety is a top concern; that is why we provide both a static way to disable unsafe code
//! (disabling the `x86` feature) and a dynamic one (calling [`DecoderOptions::set_use_unsafe(false)`]).
//! Both of these disable platform-specific optimizations, which reduces the speed of decompression.
//!
//! Please do note that careful consideration has been taken to ensure that the unsafe paths
//! are only unsafe because they depend on platform specific intrinsics, hence no need to disable them
//!
//! The crate tries to decode as many images as possible, as a best effort, even those violating the standard.
//! This means a lot of images may get silent warnings and wrong output, but if you are sure you will be handling
//! images that follow the spec, set `ZuneJpegOptions::set_strict` to true.
//!
//![`DecoderOptions::set_use_unsafe(false)`]: https://docs.rs/zune-core/0.2.1/zune_core/options/struct.DecoderOptions.html#method.set_use_unsafe
#![warn(
clippy::correctness,
clippy::perf,
clippy::pedantic,
clippy::inline_always,
clippy::missing_errors_doc,
clippy::panic
)]
#![allow(
clippy::needless_return,
clippy::similar_names,
clippy::inline_always,
clippy::similar_names,
clippy::doc_markdown,
clippy::module_name_repetitions,
clippy::missing_panics_doc,
clippy::missing_errors_doc
)]
// no_std compatibility
#![deny(clippy::std_instead_of_alloc, clippy::alloc_instead_of_core)]
#![cfg_attr(not(feature = "x86"), forbid(unsafe_code))]
#![cfg_attr(not(feature = "std"), no_std)]
#![macro_use]
extern crate alloc;
extern crate core;
pub use zune_core;
pub use crate::decoder::{ImageInfo, JpegDecoder};
mod bitstream;
mod color_convert;
mod components;
mod decoder;
pub mod errors;
mod headers;
mod huffman;
#[cfg(not(fuzzing))]
mod idct;
#[cfg(fuzzing)]
pub mod idct;
mod marker;
mod mcu;
mod mcu_prog;
mod misc;
mod unsafe_utils;
mod unsafe_utils_avx2;
mod unsafe_utils_neon;
mod upsampler;
mod worker;

85
third_party/zune-jpeg/src/marker.rs vendored

@ -0,0 +1,85 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
#![allow(clippy::upper_case_acronyms)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Marker {
    /// Start Of Frame markers
    ///
    /// - SOF(0): Baseline DCT (Huffman coding)
    /// - SOF(1): Extended sequential DCT (Huffman coding)
    /// - SOF(2): Progressive DCT (Huffman coding)
    /// - SOF(3): Lossless (sequential) (Huffman coding)
    /// - SOF(5): Differential sequential DCT (Huffman coding)
    /// - SOF(6): Differential progressive DCT (Huffman coding)
    /// - SOF(7): Differential lossless (sequential) (Huffman coding)
    /// - SOF(9): Extended sequential DCT (arithmetic coding)
    /// - SOF(10): Progressive DCT (arithmetic coding)
    /// - SOF(11): Lossless (sequential) (arithmetic coding)
    /// - SOF(13): Differential sequential DCT (arithmetic coding)
    /// - SOF(14): Differential progressive DCT (arithmetic coding)
    /// - SOF(15): Differential lossless (sequential) (arithmetic coding)
    SOF(u8),
    /// Define Huffman table(s)
    DHT,
    /// Define arithmetic coding conditioning(s)
    DAC,
    /// Restart with modulo 8 count `m`
    RST(u8),
    /// Start of image
    SOI,
    /// End of image
    EOI,
    /// Start of scan
    SOS,
    /// Define quantization table(s)
    DQT,
    /// Define number of lines
    DNL,
    /// Define restart interval
    DRI,
    /// Reserved for application segments
    APP(u8),
    /// Comment
    COM
}
impl Marker {
    /// Map the byte following an `0xFF` in the stream to its marker.
    ///
    /// Covers the marker code assignments of ITU-T T.81 Table B.1 for the
    /// variants represented by [`Marker`]. Returns `None` for reserved or
    /// unsupported marker bytes (e.g. `0xC8`/JPG, TEM, JPGn extensions).
    pub fn from_u8(n: u8) -> Option<Marker> {
        use self::Marker::{APP, COM, DAC, DHT, DNL, DQT, DRI, EOI, RST, SOF, SOI, SOS};
        match n {
            0xFE => Some(COM),
            // DHT (0xC4) and DAC (0xCC) punch holes in the SOF range below.
            0xC4 => Some(DHT),
            0xCC => Some(DAC),
            // SOF(0)..=SOF(15) occupy 0xC0..=0xCF, excluding 0xC4 (DHT),
            // 0xC8 (reserved JPG marker -> None) and 0xCC (DAC).
            0xC0..=0xC3 | 0xC5..=0xC7 | 0xC9..=0xCB | 0xCD..=0xCF => Some(SOF(n - 0xC0)),
            // Restart markers RST(0)..=RST(7)
            0xD0..=0xD7 => Some(RST(n - 0xD0)),
            0xD8 => Some(SOI),
            0xD9 => Some(EOI),
            0xDA => Some(SOS),
            0xDB => Some(DQT),
            0xDC => Some(DNL),
            0xDD => Some(DRI),
            // Application segments APP(0)..=APP(15)
            0xE0..=0xEF => Some(APP(n - 0xE0)),
            _ => None
        }
    }
}

504
third_party/zune-jpeg/src/mcu.rs vendored

@ -0,0 +1,504 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
use alloc::{format, vec};
use core::cmp::min;
use zune_core::bytestream::ZByteReaderTrait;
use zune_core::colorspace::ColorSpace;
use zune_core::colorspace::ColorSpace::Luma;
use zune_core::log::{error, trace, warn};
use crate::bitstream::BitStream;
use crate::components::SampleRatios;
use crate::decoder::MAX_COMPONENTS;
use crate::errors::DecodeErrors;
use crate::marker::Marker;
use crate::misc::{calculate_padded_width, setup_component_params};
use crate::worker::{color_convert, upsample};
use crate::JpegDecoder;
/// The number of coefficients in one 8x8 DCT block of an MCU.
pub const DCT_BLOCK: usize = 64;
impl<T: ZByteReaderTrait> JpegDecoder<T> {
/// Check for existence of DC and AC Huffman Tables
///
/// Run once before the hot decode loop so that the per-block decoding
/// routines can unwrap the tables without re-checking.
pub(crate) fn check_tables(&self) -> Result<(), DecodeErrors> {
    for component in &self.components {
        // Slot may be out of range (first error) or present but never
        // filled in by a DHT segment (second error).
        match self.dc_huffman_tables.get(component.dc_huff_table) {
            None => {
                return Err(DecodeErrors::HuffmanDecode(format!(
                    "No Huffman DC table for component {:?} ",
                    component.component_id
                )))
            }
            Some(None) => {
                return Err(DecodeErrors::HuffmanDecode(format!(
                    "No DC table for component {:?}",
                    component.component_id
                )))
            }
            Some(Some(_)) => {}
        }
        match self.ac_huffman_tables.get(component.ac_huff_table) {
            None => {
                return Err(DecodeErrors::HuffmanDecode(format!(
                    "No Huffman AC table for component {:?} ",
                    component.component_id
                )))
            }
            Some(None) => {
                return Err(DecodeErrors::HuffmanDecode(format!(
                    "No AC table for component {:?}",
                    component.component_id
                )))
            }
            Some(Some(_)) => {}
        }
    }
    Ok(())
}
/// Decode MCUs and carry out post processing.
///
/// This is the main decoder loop for the library, the hot path.
///
/// Because of this, we pull in some very crazy optimization tricks hence readability is a pinch
/// here.
///
/// `pixels` is the caller-provided output buffer; color-converted rows are
/// written to it one MCU row at a time via `post_process` (presumably sized
/// by the caller from the image dimensions — confirm against `decode()`).
///
/// # Errors
/// Returns a [`DecodeErrors`] variant on missing Huffman tables, component
/// mismatches, or (strict mode only) a prematurely ending bitstream.
#[allow(
    clippy::similar_names,
    clippy::too_many_lines,
    clippy::cast_possible_truncation
)]
#[inline(never)]
pub(crate) fn decode_mcu_ycbcr_baseline(
    &mut self, pixels: &mut [u8]
) -> Result<(), DecodeErrors> {
    setup_component_params(self)?;
    // check dc and AC tables
    self.check_tables()?;
    let (mut mcu_width, mut mcu_height);
    if self.is_interleaved {
        // set upsampling functions
        self.set_upsampling()?;
        mcu_width = self.mcu_x;
        mcu_height = self.mcu_y;
    } else {
        // For non-interleaved images( (1*1) subsampling)
        // number of MCU's are the widths (+7 to account for paddings) divided by 8.
        mcu_width = ((self.info.width + 7) / 8) as usize;
        mcu_height = ((self.info.height + 7) / 8) as usize;
    }
    if self.is_interleaved
        && self.input_colorspace.num_components() > 1
        && self.options.jpeg_get_out_colorspace().num_components() == 1
        && (self.sub_sample_ratio == SampleRatios::V
            || self.sub_sample_ratio == SampleRatios::HV)
    {
        // For a specific set of images, e.g interleaved,
        // when converting from YcbCr to grayscale, we need to
        // take into account mcu height since the MCU decoding needs to take
        // it into account for padding purposes and the post processor
        // parses two rows per mcu width.
        //
        // set coeff to be 2 to ensure that we increment two rows
        // for every mcu processed also
        mcu_height *= self.v_max;
        mcu_height /= self.h_max;
        self.coeff = 2;
    }
    if self.input_colorspace.num_components() > self.components.len() {
        let msg = format!(
            " Expected {} number of components but found {}",
            self.input_colorspace.num_components(),
            self.components.len()
        );
        return Err(DecodeErrors::Format(msg));
    }
    if self.input_colorspace == ColorSpace::Luma && self.is_interleaved {
        warn!("Grayscale image with down-sampled component, resetting component details");
        self.reset_params();
        // recompute MCU counts with the reset (1x1) sampling parameters
        mcu_width = ((self.info.width + 7) / 8) as usize;
        mcu_height = ((self.info.height + 7) / 8) as usize;
    }
    let width = usize::from(self.info.width);
    let padded_width = calculate_padded_width(width, self.sub_sample_ratio);
    let mut stream = BitStream::new();
    // reusable 8x8 coefficient scratch block shared by all components
    let mut tmp = [0_i32; DCT_BLOCK];
    let comp_len = self.components.len();
    for (pos, comp) in self.components.iter_mut().enumerate() {
        // Allocate only needed components.
        //
        // For special colorspaces i.e YCCK and CMYK, just allocate all of the needed
        // components.
        if min(
            self.options.jpeg_get_out_colorspace().num_components() - 1,
            pos
        ) == pos
            || comp_len == 4
        // Special colorspace
        {
            // allocate enough space to hold a whole MCU width
            // this means we should take into account sampling ratios
            // `*8` is because each MCU spans 8 widths.
            let len = comp.width_stride * comp.vertical_sample * 8;
            comp.needed = true;
            comp.raw_coeff = vec![0; len];
        } else {
            comp.needed = false;
        }
    }
    let mut pixels_written = 0;
    // scratch row only needed when upsampling (interleaved images)
    let is_hv = usize::from(self.is_interleaved);
    let upsampler_scratch_size = is_hv * self.components[0].width_stride;
    let mut upsampler_scratch_space = vec![0; upsampler_scratch_size];
    for i in 0..mcu_height {
        // Report if we have no more bytes
        // This may generate false negatives since we over-read bytes
        // hence that why 37 is chosen(we assume if we over-read more than 37 bytes, we have a problem)
        if stream.overread_by > 37
        // favourite number :)
        {
            if self.options.strict_mode() {
                return Err(DecodeErrors::FormatStatic("Premature end of buffer"));
            };
            error!("Premature end of buffer");
            break;
        }
        // decode a whole MCU width,
        // this takes into account interleaved components.
        self.decode_mcu_width(mcu_width, &mut tmp, &mut stream)?;
        // process that width up until it's impossible
        self.post_process(
            pixels,
            i,
            mcu_height,
            width,
            padded_width,
            &mut pixels_written,
            &mut upsampler_scratch_space
        )?;
    }
    // it may happen that some images don't have the whole buffer
    // so we can't panic in case of that
    // assert_eq!(pixels_written, pixels.len());
    trace!("Finished decoding image");
    Ok(())
}
/// Decode one full horizontal strip of `mcu_width` MCUs, plus any in-stream
/// marker that follows it.
///
/// `tmp` is a reusable 8x8 coefficient scratch block and `stream` holds the
/// entropy-coded bitstream state; decoded blocks are IDCT-ed straight into
/// each needed component's `raw_coeff` buffer.
fn decode_mcu_width(
    &mut self, mcu_width: usize, tmp: &mut [i32; 64], stream: &mut BitStream
) -> Result<(), DecodeErrors> {
    for j in 0..mcu_width {
        // iterate over components
        for component in &mut self.components {
            // `% MAX_COMPONENTS` keeps the index in bounds; check_tables()
            // already verified the tables exist, hence the unwraps.
            let dc_table = self.dc_huffman_tables[component.dc_huff_table % MAX_COMPONENTS]
                .as_ref()
                .unwrap();
            let ac_table = self.ac_huffman_tables[component.ac_huff_table % MAX_COMPONENTS]
                .as_ref()
                .unwrap();
            let qt_table = &component.quantization_table;
            let channel = &mut component.raw_coeff;
            // If image is interleaved iterate over scan components,
            // otherwise if it's non-interleaved, these routines iterate in
            // trivial scanline order(Y,Cb,Cr)
            for v_samp in 0..component.vertical_sample {
                for h_samp in 0..component.horizontal_sample {
                    // Fill the array with zeroes, decode_mcu_block expects
                    // a zero based array.
                    tmp.fill(0);
                    stream.decode_mcu_block(
                        &mut self.stream,
                        dc_table,
                        ac_table,
                        qt_table,
                        tmp,
                        &mut component.dc_pred
                    )?;
                    if component.needed {
                        let idct_position = {
                            // derived from stb and rewritten for my tastes
                            let c2 = v_samp * 8;
                            let c3 = ((j * component.horizontal_sample) + h_samp) * 8;
                            component.width_stride * c2 + c3
                        };
                        let idct_pos = channel.get_mut(idct_position..).unwrap();
                        // call idct.
                        (self.idct_func)(tmp, idct_pos, component.width_stride);
                    }
                }
            }
        }
        // count down towards the next expected restart marker
        self.todo = self.todo.saturating_sub(1);
        // After all interleaved components, that's an MCU
        // handle stream markers
        //
        // In some corrupt images, it may occur that header markers occur in the stream.
        // The spec EXPLICITLY FORBIDS this, specifically, in
        // routine F.2.2.5 it says
        // `The only valid marker which may occur within the Huffman coded data is the RSTm marker.`
        //
        // But libjpeg-turbo allows it because of some weird reason. so I'll also
        // allow it because of some weird reason.
        if let Some(m) = stream.marker {
            if m == Marker::EOI {
                // acknowledge and ignore EOI marker.
                stream.marker.take();
                trace!("Found EOI marker");
                // Google Introduced the Ultra-HD image format which is basically
                // stitching two images into one container.
                // They basically separate two images via a EOI and SOI marker
                // so let's just ensure if we ever see EOI, we never read past that
                // ever.
                // https://github.com/google/libultrahdr
                stream.seen_eoi = true;
            } else if let Marker::RST(_) = m {
                if self.todo == 0 {
                    self.handle_rst(stream)?;
                }
            } else {
                if self.options.strict_mode() {
                    return Err(DecodeErrors::Format(format!(
                        "Marker {m:?} found where not expected"
                    )));
                }
                error!(
                    "Marker `{:?}` Found within Huffman Stream, possibly corrupt jpeg",
                    m
                );
                self.parse_marker_inner(m)?;
            }
        }
    }
    Ok(())
}
// handle RST markers.
// No-op if not using restarts
// this routine is shared with mcu_prog
#[cold]
pub(crate) fn handle_rst(&mut self, stream: &mut BitStream) -> Result<(), DecodeErrors> {
    // a fresh restart interval begins after this marker
    self.todo = self.restart_interval;
    match stream.marker {
        Some(Marker::RST(_)) => {
            // A genuine restart marker: flush bitstream state and clear the
            // DC predictions of every component, as restarts require.
            stream.reset();
            for component in &mut self.components {
                component.dc_pred = 0;
            }
        }
        // EOI is dealt with by the caller; no marker at all means
        // restarts are simply not in use. Both are silent passes.
        Some(Marker::EOI) | None => {}
        Some(marker) => {
            // Anything else inside entropy-coded data is corruption.
            return Err(DecodeErrors::MCUError(format!(
                "Marker {marker:?} found in bitstream, possibly corrupt jpeg"
            )));
        }
    }
    Ok(())
}
/// Upsample (when needed) and color-convert one decoded MCU row into `pixels`.
///
/// `i` is the current MCU row index, `mcu_height` the total number of MCU
/// rows, `width`/`padded_width` the real and padded image widths in pixels.
/// `pixels_written` is a running byte cursor into `pixels`, advanced by the
/// amount written here.
#[allow(clippy::too_many_lines, clippy::too_many_arguments)]
pub(crate) fn post_process(
    &mut self, pixels: &mut [u8], i: usize, mcu_height: usize, width: usize,
    padded_width: usize, pixels_written: &mut usize, upsampler_scratch_space: &mut [i16]
) -> Result<(), DecodeErrors> {
    let out_colorspace_components = self.options.jpeg_get_out_colorspace().num_components();
    let mut px = *pixels_written;
    // indicates whether image is vertically up-sampled
    let is_vertically_sampled = self
        .components
        .iter()
        .any(|c| c.sample_ratio == SampleRatios::HV || c.sample_ratio == SampleRatios::V);
    let mut comp_len = self.components.len();
    // If we are moving from YCbCr-> Luma, we do not allocate storage for other components, so we
    // will panic when we are trying to read samples, so for that case,
    // hardcode it so that we don't panic when doing
    // *samp = &samples[j][pos * padded_width..(pos + 1) * padded_width]
    if out_colorspace_components < comp_len && self.options.jpeg_get_out_colorspace() == Luma {
        comp_len = out_colorspace_components;
    }
    // Converts `num_iters` lines from `samples` into the output buffer,
    // advancing the shared `px` cursor as it goes.
    let mut color_conv_function =
        |num_iters: usize, samples: [&[i16]; 4]| -> Result<(), DecodeErrors> {
            for (pos, output) in pixels[px..]
                .chunks_exact_mut(width * out_colorspace_components)
                .take(num_iters)
                .enumerate()
            {
                let mut raw_samples: [&[i16]; 4] = [&[], &[], &[], &[]];
                // iterate over each line, since color-convert needs only
                // one line
                for (j, samp) in raw_samples.iter_mut().enumerate().take(comp_len) {
                    *samp = &samples[j][pos * padded_width..(pos + 1) * padded_width];
                }
                color_convert(
                    &raw_samples,
                    self.color_convert_16,
                    self.input_colorspace,
                    self.options.jpeg_get_out_colorspace(),
                    output,
                    width,
                    padded_width
                )?;
                px += width * out_colorspace_components;
            }
            Ok(())
        };
    let comps = &mut self.components[..];
    if self.is_interleaved && self.options.jpeg_get_out_colorspace() != ColorSpace::Luma {
        {
            // duplicated so that we can check that samples match
            // Fixes bug https://github.com/etemesi254/zune-image/issues/151
            let mut samples: [&[i16]; 4] = [&[], &[], &[], &[]];
            for (samp, component) in samples.iter_mut().zip(comps.iter()) {
                *samp = if component.sample_ratio == SampleRatios::None {
                    &component.raw_coeff
                } else {
                    &component.upsample_dest
                };
            }
        }
        for comp in comps.iter_mut() {
            upsample(comp, mcu_height, i, upsampler_scratch_space);
        }
        if is_vertically_sampled {
            if i > 0 {
                // write the last line, it wasn't up-sampled as we didn't have row_down
                // yet
                let mut samples: [&[i16]; 4] = [&[], &[], &[], &[]];
                for (samp, component) in samples.iter_mut().zip(comps.iter()) {
                    *samp = &component.first_row_upsample_dest;
                }
                // ensure length matches for all samples
                let first_len = samples[0].len();
                for samp in samples.iter().take(comp_len) {
                    assert_eq!(first_len, samp.len());
                }
                let num_iters = self.coeff * self.v_max;
                color_conv_function(num_iters, samples)?;
            }
            // After upsampling the last row, save any row that can be used for
            // a later upsampling,
            //
            // E.g the Y sample is not sampled but we haven't finished upsampling the last row of
            // the previous mcu, since we don't have the down row, so save it
            for component in comps.iter_mut() {
                // copy last row to be used for the next color conversion
                let size = component.vertical_sample
                    * component.width_stride
                    * component.sample_ratio.sample();
                let last_bytes = component.raw_coeff.rchunks_exact_mut(size).next().unwrap();
                component
                    .first_row_upsample_dest
                    .copy_from_slice(last_bytes);
            }
        }
        let mut samples: [&[i16]; 4] = [&[], &[], &[], &[]];
        for (samp, component) in samples.iter_mut().zip(comps.iter()) {
            *samp = if component.sample_ratio == SampleRatios::None {
                &component.raw_coeff
            } else {
                &component.upsample_dest
            };
        }
        // we either do 7 or 8 MCU's depending on the state, this only applies to
        // vertically sampled images
        //
        // for rows up until the last MCU, we do not upsample the last stride of the MCU
        // which means that the number of iterations should take that into account is one less the
        // up-sampled size
        //
        // For the last MCU, we upsample the last stride, meaning that if we hit the last MCU, we
        // should sample full raw coeffs
        let is_last_considered = is_vertically_sampled && (i != mcu_height.saturating_sub(1));
        let num_iters = (8 - usize::from(is_last_considered)) * self.coeff * self.v_max;
        color_conv_function(num_iters, samples)?;
    } else {
        // no upsampling needed: hand the raw coefficient buffers straight
        // to color conversion
        let mut channels_ref: [&[i16]; MAX_COMPONENTS] = [&[]; MAX_COMPONENTS];
        self.components
            .iter()
            .enumerate()
            .for_each(|(pos, x)| channels_ref[pos] = &x.raw_coeff);
        color_conv_function(8 * self.coeff, channels_ref)?;
    }
    *pixels_written = px;
    Ok(())
}
}
// #[cfg(test)]
// mod tests {
// use zune_core::bytestream::ZCursor;
//
// use crate::JpegDecoder;
//
// #[test]
// fn im() {
// let image = std::fs::read("/home/caleb/Downloads/re.jpg").unwrap();
// JpegDecoder::new(ZCursor::new(&image)).decode().unwrap();
// }
// }

617
third_party/zune-jpeg/src/mcu_prog.rs vendored

@ -0,0 +1,617 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//!Routines for progressive decoding
/*
This file is needlessly complicated,
It is that way to ensure we don't burn memory anyhow
Memory is a scarce resource in some environments, I would like this to be viable
in such environments
Half of the complexity comes from the jpeg spec, because progressive decoding,
is one hell of a ride.
*/
use alloc::string::ToString;
use alloc::vec::Vec;
use alloc::{format, vec};
use core::cmp::min;
use zune_core::bytestream::{ZByteReaderTrait, ZReader};
use zune_core::colorspace::ColorSpace;
use zune_core::log::{debug, error, warn};
use crate::bitstream::BitStream;
use crate::components::{ComponentID, SampleRatios};
use crate::decoder::{JpegDecoder, MAX_COMPONENTS};
use crate::errors::DecodeErrors;
use crate::errors::DecodeErrors::Format;
use crate::headers::{parse_huffman, parse_sos};
use crate::marker::Marker;
use crate::mcu::DCT_BLOCK;
use crate::misc::{calculate_padded_width, setup_component_params};
impl<T: ZByteReaderTrait> JpegDecoder<T> {
/// Decode a progressive image
///
/// This routine decodes a progressive image, stopping if it finds any error.
///
/// Coefficients for all scans are accumulated into per-component buffers
/// (`block`); once every scan is parsed, `finish_progressive_decoding`
/// dequantizes, IDCTs and color-converts them into `pixels`.
#[allow(
    clippy::needless_range_loop,
    clippy::cast_sign_loss,
    clippy::redundant_else,
    clippy::too_many_lines
)]
#[inline(never)]
pub(crate) fn decode_mcu_ycbcr_progressive(
    &mut self, pixels: &mut [u8]
) -> Result<(), DecodeErrors> {
    setup_component_params(self)?;
    let mut mcu_height;
    // memory location for decoded pixels for components
    let mut block: [Vec<i16>; MAX_COMPONENTS] = [vec![], vec![], vec![], vec![]];
    let mut mcu_width;
    let mut seen_scans = 1;
    if self.input_colorspace == ColorSpace::Luma && self.is_interleaved {
        warn!("Grayscale image with down-sampled component, resetting component details");
        self.reset_params();
    }
    if self.is_interleaved {
        // this helps us catch component errors.
        self.set_upsampling()?;
    }
    if self.is_interleaved {
        mcu_width = self.mcu_x;
        mcu_height = self.mcu_y;
    } else {
        // non-interleaved: one MCU per 8x8 block, rounded up
        mcu_width = (self.info.width as usize + 7) / 8;
        mcu_height = (self.info.height as usize + 7) / 8;
    }
    if self.is_interleaved
        && self.input_colorspace.num_components() > 1
        && self.options.jpeg_get_out_colorspace().num_components() == 1
        && (self.sub_sample_ratio == SampleRatios::V
            || self.sub_sample_ratio == SampleRatios::HV)
    {
        // For a specific set of images, e.g interleaved,
        // when converting from YcbCr to grayscale, we need to
        // take into account mcu height since the MCU decoding needs to take
        // it into account for padding purposes and the post processor
        // parses two rows per mcu width.
        //
        // set coeff to be 2 to ensure that we increment two rows
        // for every mcu processed also
        mcu_height *= self.v_max;
        mcu_height /= self.h_max;
        self.coeff = 2;
    }
    // scale from MCU count to coefficient count (64 coefficients per block)
    mcu_width *= 64;
    if self.input_colorspace.num_components() > self.components.len() {
        let msg = format!(
            " Expected {} number of components but found {}",
            self.input_colorspace.num_components(),
            self.components.len()
        );
        return Err(DecodeErrors::Format(msg));
    }
    for i in 0..self.input_colorspace.num_components() {
        let comp = &self.components[i];
        let len = mcu_width * comp.vertical_sample * comp.horizontal_sample * mcu_height;
        block[i] = vec![0; len];
    }
    let mut stream = BitStream::new_progressive(
        self.succ_high,
        self.succ_low,
        self.spec_start,
        self.spec_end
    );
    // there are multiple scans in the stream, this should resolve the first scan
    self.parse_entropy_coded_data(&mut stream, &mut block)?;
    // extract marker
    let mut marker = stream
        .marker
        .take()
        .ok_or(DecodeErrors::FormatStatic("Marker missing where expected"))?;
    // if marker is EOI, we are done, otherwise continue scanning.
    //
    // In case we have a premature image, we print a warning or return
    // an error, depending on the strictness of the decoder, so there
    // is that logic to handle too
    'eoi: while marker != Marker::EOI {
        match marker {
            Marker::DHT => {
                parse_huffman(self)?;
            }
            Marker::SOS => {
                parse_sos(self)?;
                stream.update_progressive_params(
                    self.succ_high,
                    self.succ_low,
                    self.spec_start,
                    self.spec_end
                );
                // after every SOS, marker, parse data for that scan.
                self.parse_entropy_coded_data(&mut stream, &mut block)?;
                // extract marker, might either indicate end of image or we continue
                // scanning(hence the continue statement to determine).
                match get_marker(&mut self.stream, &mut stream) {
                    Ok(marker_n) => {
                        marker = marker_n;
                        seen_scans += 1;
                        // guard against decompression bombs with
                        // pathologically many scans
                        if seen_scans > self.options.jpeg_get_max_scans() {
                            return Err(DecodeErrors::Format(format!(
                                "Too many scans, exceeded limit of {}",
                                self.options.jpeg_get_max_scans()
                            )));
                        }
                        stream.reset();
                        continue 'eoi;
                    }
                    Err(msg) => {
                        if self.options.strict_mode() {
                            return Err(msg);
                        }
                        error!("{:?}", msg);
                        break 'eoi;
                    }
                }
            }
            _ => {
                // any other marker ends scan processing
                break 'eoi;
            }
        }
        match get_marker(&mut self.stream, &mut stream) {
            Ok(marker_n) => {
                marker = marker_n;
            }
            Err(e) => {
                if self.options.strict_mode() {
                    return Err(e);
                }
                error!("{}", e);
            }
        }
    }
    self.finish_progressive_decoding(&block, mcu_width, pixels)
}
/// Parse the entropy-coded data of a single progressive scan into `buffer`.
///
/// Depending on the scan header this decodes either DC coefficients
/// (first or refinement pass, possibly interleaved) or AC coefficients
/// (never interleaved) for the components selected by `self.z_order`.
///
/// # Errors
/// Returns a `DecodeErrors` variant on scan/component mismatches, missing
/// Huffman tables, or malformed entropy-coded segments.
#[allow(clippy::too_many_lines, clippy::cast_sign_loss)]
fn parse_entropy_coded_data(
    &mut self, stream: &mut BitStream, buffer: &mut [Vec<i16>; MAX_COMPONENTS]
) -> Result<(), DecodeErrors> {
    stream.reset();
    self.components.iter_mut().for_each(|x| x.dc_pred = 0);
    if usize::from(self.num_scans) > self.input_colorspace.num_components() {
        return Err(Format(format!(
            "Number of scans {} cannot be greater than number of components, {}",
            self.num_scans,
            self.input_colorspace.num_components()
        )));
    }
    if self.num_scans == 1 {
        // Safety checks
        if self.spec_end != 0 && self.spec_start == 0 {
            return Err(DecodeErrors::FormatStatic(
                "Can't merge DC and AC corrupt jpeg"
            ));
        }
        // non interleaved data, process one block at a time in trivial scanline order
        let k = self.z_order[0];
        if k >= self.components.len() {
            return Err(DecodeErrors::Format(format!(
                "Cannot find component {k}, corrupt image"
            )));
        }
        let (mcu_width, mcu_height);
        if self.components[k].component_id == ComponentID::Y
            && (self.components[k].vertical_sample != 1
                || self.components[k].horizontal_sample != 1)
            || !self.is_interleaved
        {
            // For Y channel or non interleaved scans,
            // mcu's is the image dimensions divided by 8
            mcu_width = ((self.info.width + 7) / 8) as usize;
            mcu_height = ((self.info.height + 7) / 8) as usize;
        } else {
            // For other channels, in an interleaved mcu, number of MCU's
            // are determined by some weird maths done in headers.rs->parse_sos()
            mcu_width = self.mcu_x;
            mcu_height = self.mcu_y;
        }
        for i in 0..mcu_height {
            for j in 0..mcu_width {
                if self.spec_start != 0 && self.succ_high == 0 && stream.eob_run > 0 {
                    // handle EOB runs here.
                    stream.eob_run -= 1;
                    continue;
                }
                let start = 64 * (j + i * (self.components[k].width_stride / 8));
                let data: &mut [i16; 64] = buffer
                    .get_mut(k)
                    .unwrap()
                    .get_mut(start..start + 64)
                    .unwrap()
                    .try_into()
                    .unwrap();
                if self.spec_start == 0 {
                    let pos = self.components[k].dc_huff_table & (MAX_COMPONENTS - 1);
                    let dc_table = self
                        .dc_huffman_tables
                        .get(pos)
                        .ok_or(DecodeErrors::FormatStatic(
                            "No huffman table for DC component"
                        ))?
                        .as_ref()
                        // NB: this message previously contained a stray `{}`
                        // placeholder that was never formatted (FormatStatic
                        // carries a plain static string); use the same wording
                        // as the interleaved branch below.
                        .ok_or(DecodeErrors::FormatStatic(
                            "Huffman table at index not initialized"
                        ))?;
                    let dc_pred = &mut self.components[k].dc_pred;
                    if self.succ_high == 0 {
                        // first scan for this mcu
                        stream.decode_prog_dc_first(
                            &mut self.stream,
                            dc_table,
                            &mut data[0],
                            dc_pred
                        )?;
                    } else {
                        // refining scans for this MCU
                        stream.decode_prog_dc_refine(&mut self.stream, &mut data[0])?;
                    }
                } else {
                    let pos = self.components[k].ac_huff_table;
                    let ac_table = self
                        .ac_huffman_tables
                        .get(pos)
                        .ok_or_else(|| {
                            DecodeErrors::Format(format!(
                                "No huffman table for component:{pos}"
                            ))
                        })?
                        .as_ref()
                        .ok_or_else(|| {
                            DecodeErrors::Format(format!(
                                "Huffman table at index {pos} not initialized"
                            ))
                        })?;
                    if self.succ_high == 0 {
                        debug_assert!(stream.eob_run == 0, "EOB run is not zero");
                        stream.decode_mcu_ac_first(&mut self.stream, ac_table, data)?;
                    } else {
                        // refinement scan
                        stream.decode_mcu_ac_refine(&mut self.stream, ac_table, data)?;
                    }
                }
                // + EOB and investigate effect.
                // Wrapping subtraction for the same reason as the interleaved
                // branch below: when no restart interval is defined, `todo`
                // is zero and a plain `-= 1` would underflow-panic in debug
                // builds.
                self.todo = self.todo.wrapping_sub(1);
                if self.todo == 0 {
                    self.handle_rst(stream)?;
                }
            }
        }
    } else {
        if self.spec_end != 0 {
            return Err(DecodeErrors::HuffmanDecode(
                "Can't merge dc and AC corrupt jpeg".to_string()
            ));
        }
        // process scan n elements in order
        // Do the error checking with allocs here.
        // Make the one in the inner loop free of allocations.
        for k in 0..self.num_scans {
            let n = self.z_order[k as usize];
            if n >= self.components.len() {
                return Err(DecodeErrors::Format(format!(
                    "Cannot find component {n}, corrupt image"
                )));
            }
            let component = &mut self.components[n];
            let _ = self
                .dc_huffman_tables
                .get(component.dc_huff_table)
                .ok_or_else(|| {
                    DecodeErrors::Format(format!(
                        "No huffman table for component:{}",
                        component.dc_huff_table
                    ))
                })?
                .as_ref()
                .ok_or_else(|| {
                    DecodeErrors::Format(format!(
                        "Huffman table at index {} not initialized",
                        component.dc_huff_table
                    ))
                })?;
        }
        // Interleaved scan
        // Components shall not be interleaved in progressive mode, except for
        // the DC coefficients in the first scan for each component of a progressive frame.
        for i in 0..self.mcu_y {
            for j in 0..self.mcu_x {
                // process scan n elements in order
                for k in 0..self.num_scans {
                    let n = self.z_order[k as usize];
                    let component = &mut self.components[n];
                    let huff_table = self
                        .dc_huffman_tables
                        .get(component.dc_huff_table)
                        .ok_or(DecodeErrors::FormatStatic("No huffman table for component"))?
                        .as_ref()
                        .ok_or(DecodeErrors::FormatStatic(
                            "Huffman table at index not initialized"
                        ))?;
                    for v_samp in 0..component.vertical_sample {
                        for h_samp in 0..component.horizontal_sample {
                            let x2 = j * component.horizontal_sample + h_samp;
                            let y2 = i * component.vertical_sample + v_samp;
                            let position = 64 * (x2 + y2 * component.width_stride / 8);
                            let data = &mut buffer[n][position];
                            if self.succ_high == 0 {
                                stream.decode_prog_dc_first(
                                    &mut self.stream,
                                    huff_table,
                                    data,
                                    &mut component.dc_pred
                                )?;
                            } else {
                                stream.decode_prog_dc_refine(&mut self.stream, data)?;
                            }
                        }
                    }
                }
                // We want wrapping subtraction here because it means
                // we get a higher number in the case this underflows
                self.todo = self.todo.wrapping_sub(1);
                // after every scan that's a mcu, count down restart markers.
                if self.todo == 0 {
                    self.handle_rst(stream)?;
                }
            }
        }
    }
    return Ok(());
}
/// Dequantize, IDCT and color-convert the fully accumulated progressive
/// coefficients in `block` into the output `pixels` buffer.
///
/// `_mcu_width` is unused here (the per-row step is derived from the block
/// length instead).
#[allow(clippy::too_many_lines)]
#[allow(clippy::needless_range_loop, clippy::cast_sign_loss)]
fn finish_progressive_decoding(
    &mut self, block: &[Vec<i16>; MAX_COMPONENTS], _mcu_width: usize, pixels: &mut [u8]
) -> Result<(), DecodeErrors> {
    // This function is complicated because we need to replicate
    // the function in mcu.rs
    //
    // The advantage is that we do very little allocation and very lot
    // channel reusing.
    // The trick is to notice that we repeat the same procedure per MCU
    // width.
    //
    // So we can set it up that we only allocate temporary storage large enough
    // to store a single mcu width, then reuse it per invocation.
    //
    // This is advantageous to us.
    //
    // Remember we need to have the whole MCU buffer so we store 3 unprocessed
    // channels in memory, and then we allocate the whole output buffer in memory, both of
    // which are huge.
    //
    //
    let mcu_height = if self.is_interleaved {
        self.mcu_y
    } else {
        // For non-interleaved images( (1*1) subsampling)
        // number of MCU's are the widths (+7 to account for paddings) divided by 8.
        ((self.info.height + 7) / 8) as usize
    };
    // Size of our output image(width*height)
    let is_hv = usize::from(self.is_interleaved);
    let upsampler_scratch_size = is_hv * self.components[0].width_stride;
    let width = usize::from(self.info.width);
    let padded_width = calculate_padded_width(width, self.sub_sample_ratio);
    //let mut pixels = vec![0; capacity * out_colorspace_components];
    let mut upsampler_scratch_space = vec![0; upsampler_scratch_size];
    // reusable 8x8 dequantized-coefficient scratch block
    let mut tmp = [0_i32; DCT_BLOCK];
    for (pos, comp) in self.components.iter_mut().enumerate() {
        // Allocate only needed components.
        //
        // For special colorspaces i.e YCCK and CMYK, just allocate all of the needed
        // components.
        if min(
            self.options.jpeg_get_out_colorspace().num_components() - 1,
            pos
        ) == pos
            || self.input_colorspace == ColorSpace::YCCK
            || self.input_colorspace == ColorSpace::CMYK
        {
            // allocate enough space to hold a whole MCU width
            // this means we should take into account sampling ratios
            // `*8` is because each MCU spans 8 widths.
            let len = comp.width_stride * comp.vertical_sample * 8;
            comp.needed = true;
            comp.raw_coeff = vec![0; len];
        } else {
            comp.needed = false;
        }
    }
    let mut pixels_written = 0;
    // dequantize, idct and color convert.
    for i in 0..mcu_height {
        'component: for (position, component) in &mut self.components.iter_mut().enumerate() {
            if !component.needed {
                continue 'component;
            }
            let qt_table = &component.quantization_table;
            // step is the number of pixels this iteration will be handling
            // Given by the number of mcu's height and the length of the component block
            // Since the component block contains the whole channel as raw pixels
            // we this evenly divides the pixels into MCU blocks
            //
            // For interleaved images, this gives us the exact pixels comprising a whole MCU
            // block
            let step = block[position].len() / mcu_height;
            // where we will be reading our pixels from.
            let start = i * step;
            let slice = &block[position][start..start + step];
            let temp_channel = &mut component.raw_coeff;
            // The next logical step is to iterate width wise.
            // To figure out how many pixels we iterate by we use effective pixels
            // Given to us by component.x
            // iterate per effective pixels.
            let mcu_x = component.width_stride / 8;
            // iterate per every vertical sample.
            for k in 0..component.vertical_sample {
                for j in 0..mcu_x {
                    // after writing a single stride, we need to skip 8 rows.
                    // This does the row calculation
                    let width_stride = k * 8 * component.width_stride;
                    let start = j * 64 + width_stride;
                    // dequantize
                    for ((x, out), qt_val) in slice[start..start + 64]
                        .iter()
                        .zip(tmp.iter_mut())
                        .zip(qt_table.iter())
                    {
                        *out = i32::from(*x) * qt_val;
                    }
                    // determine where to write.
                    let sl = &mut temp_channel[component.idct_pos..];
                    component.idct_pos += 8;
                    // tmp now contains a dequantized block so idct it
                    (self.idct_func)(&mut tmp, sl, component.width_stride);
                }
                // after every write of 8, skip 7 since idct write stride wise 8 times.
                //
                // Remember each MCU is 8x8 block, so each idct will write 8 strides into
                // sl
                //
                // and component.idct_pos is one stride long
                component.idct_pos += 7 * component.width_stride;
            }
            component.idct_pos = 0;
        }
        // process that width up until it's impossible
        self.post_process(
            pixels,
            i,
            mcu_height,
            width,
            padded_width,
            &mut pixels_written,
            &mut upsampler_scratch_space
        )?;
    }
    debug!("Finished decoding image");
    return Ok(());
}
pub(crate) fn reset_params(&mut self) {
    /*
    Down-sampled grayscale images exist in the wild, which is weird since a
    single-component (Y) image is not usually sub-sampled. Sampling would make
    some calculations wrong, so we explicitly reset the parameters here so the
    image appears as a plain non-sampled grayscale image and decoding works.
    */
    self.is_interleaved = false;
    self.sub_sample_ratio = SampleRatios::None;
    self.h_max = 1;
    self.v_max = 1;
    self.options = self.options.jpeg_set_out_colorspace(ColorSpace::Luma);

    // round the width up to a whole number of 8-pixel blocks
    let mcu_aligned_width = (((self.info.width as usize) + 7) / 8) * 8;
    let luma = &mut self.components[0];
    luma.horizontal_sample = 1;
    luma.vertical_sample = 1;
    luma.width_stride = mcu_aligned_width;
}
}
///Get a marker from the bit-stream.
///
/// This reads until it gets a marker or end of file is encountered
fn get_marker<T>(reader: &mut ZReader<T>, stream: &mut BitStream) -> Result<Marker, DecodeErrors>
where
    T: ZByteReaderTrait
{
    // The bitstream may have stashed a marker it ran into while refilling;
    // hand that one out first.
    if let Some(marker) = stream.marker.take() {
        return Ok(marker);
    }
    // Otherwise scan the byte stream for the next 0xFF-prefixed marker.
    while !reader.eof()? {
        if reader.read_u8_err()? != 255 {
            continue;
        }
        // consume fill bytes, 0xFF 0xFF runs occur in some images
        let mut r = reader.read_u8_err()?;
        while r == 0xFF {
            r = reader.read_u8_err()?;
        }
        // 0xFF00 is a stuffed (escaped) 0xFF data byte, not a marker; keep scanning
        if r != 0 {
            return Marker::from_u8(r)
                .ok_or_else(|| DecodeErrors::Format(format!("Unknown marker 0xFF{r:X}")));
        }
    }
    Err(DecodeErrors::ExhaustedData)
}

431
third_party/zune-jpeg/src/misc.rs vendored

@ -0,0 +1,431 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//!Miscellaneous stuff
#![allow(dead_code)]
use alloc::format;
use core::cmp::max;
use core::fmt;
use zune_core::bytestream::ZByteReaderTrait;
use zune_core::colorspace::ColorSpace;
use zune_core::log::trace;
use crate::components::{ComponentID, SampleRatios};
use crate::errors::DecodeErrors;
use crate::huffman::HuffmanTable;
use crate::JpegDecoder;
/// Start of baseline DCT Huffman coding (SOF0)
pub const START_OF_FRAME_BASE: u16 = 0xffc0;
/// Start of extended sequential DCT, Huffman coding (SOF1)
pub const START_OF_FRAME_EXT_SEQ: u16 = 0xffc1;
/// Start of progressive DCT encoding (SOF2)
pub const START_OF_FRAME_PROG_DCT: u16 = 0xffc2;
/// Start of Lossless sequential Huffman coding (SOF3)
pub const START_OF_FRAME_LOS_SEQ: u16 = 0xffc3;
/// Start of extended sequential DCT arithmetic coding (SOF9)
pub const START_OF_FRAME_EXT_AR: u16 = 0xffc9;
/// Start of Progressive DCT arithmetic coding (SOF10)
pub const START_OF_FRAME_PROG_DCT_AR: u16 = 0xffca;
/// Start of Lossless sequential Arithmetic coding (SOF11)
pub const START_OF_FRAME_LOS_SEQ_AR: u16 = 0xffcb;
/// Undo run length encoding of coefficients by placing them in natural order
///
/// Maps a coefficient's position in the zig-zag scan to its row-major index
/// inside the 8x8 block. The table has 16 extra entries, all pointing at
/// index 63, so that a corrupt stream which over-runs the 64 real
/// coefficients clamps to the last slot instead of indexing out of bounds.
#[rustfmt::skip]
pub const UN_ZIGZAG: [usize; 64 + 16] = [
    0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63,
    // Prevent overflowing
    63, 63, 63, 63, 63, 63, 63, 63,
    63, 63, 63, 63, 63, 63, 63, 63
];
/// Wrapper that forces its contents onto a 16 byte boundary
#[repr(align(16))]
#[derive(Clone)]
pub struct Aligned16<T: ?Sized>(pub T);

impl<T> Default for Aligned16<T>
where
    T: Default
{
    /// Wrap the inner type's default value, 16-byte aligned.
    fn default() -> Self {
        Self(T::default())
    }
}
/// Wrapper that forces its contents onto a 32 byte boundary
#[repr(align(32))]
#[derive(Clone)]
pub struct Aligned32<T: ?Sized>(pub T);

impl<T> Default for Aligned32<T>
where
    T: Default
{
    /// Wrap the inner type's default value, 32-byte aligned.
    fn default() -> Self {
        Self(T::default())
    }
}
/// Markers that identify different Start of Frame markers
///
/// They encode which coding scheme a frame uses: lossy (DCT) or lossless
/// compression, and Huffman or arithmetic entropy coding.
#[derive(Eq, PartialEq, Copy, Clone)]
#[allow(clippy::upper_case_acronyms)]
pub enum SOFMarkers {
    /// Baseline DCT markers
    BaselineDct,
    /// SOF_1 Extended sequential DCT,Huffman coding
    ExtendedSequentialHuffman,
    /// Progressive DCT, Huffman coding
    ProgressiveDctHuffman,
    /// Lossless (sequential), huffman coding,
    LosslessHuffman,
    /// Extended sequential DEC, arithmetic coding
    ExtendedSequentialDctArithmetic,
    /// Progressive DCT, arithmetic coding,
    ProgressiveDctArithmetic,
    /// Lossless ( sequential), arithmetic coding
    LosslessArithmetic
}

impl Default for SOFMarkers {
    /// Baseline DCT is by far the most common frame type, so it is the default.
    fn default() -> Self {
        SOFMarkers::BaselineDct
    }
}
impl SOFMarkers {
/// Check if a certain marker is sequential DCT or not
pub fn is_sequential_dct(self) -> bool {
matches!(
self,
Self::BaselineDct
| Self::ExtendedSequentialHuffman
| Self::ExtendedSequentialDctArithmetic
)
}
/// Check if a marker is a Lossles type or not
pub fn is_lossless(self) -> bool {
matches!(self, Self::LosslessHuffman | Self::LosslessArithmetic)
}
/// Check whether a marker is a progressive marker or not
pub fn is_progressive(self) -> bool {
matches!(
self,
Self::ProgressiveDctHuffman | Self::ProgressiveDctArithmetic
)
}
/// Create a marker from an integer
pub fn from_int(int: u16) -> Option<SOFMarkers> {
match int {
START_OF_FRAME_BASE => Some(Self::BaselineDct),
START_OF_FRAME_PROG_DCT => Some(Self::ProgressiveDctHuffman),
START_OF_FRAME_PROG_DCT_AR => Some(Self::ProgressiveDctArithmetic),
START_OF_FRAME_LOS_SEQ => Some(Self::LosslessHuffman),
START_OF_FRAME_LOS_SEQ_AR => Some(Self::LosslessArithmetic),
START_OF_FRAME_EXT_SEQ => Some(Self::ExtendedSequentialHuffman),
START_OF_FRAME_EXT_AR => Some(Self::ExtendedSequentialDctArithmetic),
_ => None
}
}
}
impl fmt::Debug for SOFMarkers {
    /// Write a human readable name for the frame's coding scheme.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let name = match self {
            Self::BaselineDct => "Baseline DCT",
            Self::ExtendedSequentialHuffman => "Extended sequential DCT, Huffman Coding",
            Self::ProgressiveDctHuffman => "Progressive DCT,Huffman Encoding",
            Self::LosslessHuffman => "Lossless (sequential) Huffman encoding",
            Self::ExtendedSequentialDctArithmetic => "Extended sequential DCT, arithmetic coding",
            Self::ProgressiveDctArithmetic => "Progressive DCT, arithmetic coding",
            Self::LosslessArithmetic => "Lossless (sequential) arithmetic coding"
        };
        f.write_str(name)
    }
}
/// Set up component parameters.
///
/// This modifies the components in place setting up details needed by other
/// parts of the decoder: sampling maxima, MCU counts, per-component
/// dimensions/strides and quantization tables.
///
/// # Errors
/// Returns [`DecodeErrors::DqtError`] if a component references a
/// quantization table that was never defined.
pub(crate) fn setup_component_params<T: ZByteReaderTrait>(
    img: &mut JpegDecoder<T>
) -> Result<(), DecodeErrors> {
    let img_width = img.width();
    let img_height = img.height();

    // in case of adobe app14 being present, zero may indicate
    // either CMYK if components are 4 or RGB if components are 3,
    // see https://docs.oracle.com/javase/6/docs/api/javax/imageio/metadata/doc-files/jpeg_metadata.html
    // so since we may not know the number of components
    // we have when decoding app14, we have to defer that check
    // until now.
    //
    // We know adobe app14 was present since it's the only one that can modify
    // input colorspace to be CMYK
    if img.components.len() == 3 && img.input_colorspace == ColorSpace::CMYK {
        img.input_colorspace = ColorSpace::RGB;
    }

    for component in &mut img.components {
        // compute interleaved image info
        // h_max contains the maximum horizontal component
        img.h_max = max(img.h_max, component.horizontal_sample);
        // v_max contains the maximum vertical component
        img.v_max = max(img.v_max, component.vertical_sample);
        img.mcu_width = img.h_max * 8;
        img.mcu_height = img.v_max * 8;
        // Number of MCU's per width
        img.mcu_x = (usize::from(img.info.width) + img.mcu_width - 1) / img.mcu_width;
        // Number of MCU's per height
        img.mcu_y = (usize::from(img.info.height) + img.mcu_height - 1) / img.mcu_height;

        if img.h_max != 1 || img.v_max != 1 {
            // interleaved images have horizontal and vertical sampling factors
            // not equal to 1.
            img.is_interleaved = true;
        }
        // Extract quantization tables from the arrays into components
        let qt_table = *img.qt_tables[component.quantization_table_number as usize]
            .as_ref()
            .ok_or_else(|| {
                DecodeErrors::DqtError(format!(
                    "No quantization table for component {:?}",
                    component.component_id
                ))
            })?;

        // Effective (sub-sampled) dimensions of the component, rounded up
        // with ceiling division against the running sampling maxima.
        let x = (usize::from(img_width) * component.horizontal_sample + img.h_max - 1) / img.h_max;
        // BUGFIX: the vertical size must be computed from the *vertical*
        // sampling factor and v_max; previously this used horizontal_sample
        // and `h_max - 1` while dividing by v_max, which is not a valid
        // ceiling division (and was inconsistent with the `x` line above).
        let y =
            (usize::from(img_height) * component.vertical_sample + img.v_max - 1) / img.v_max;
        component.x = x;
        component.w2 = img.mcu_x * component.horizontal_sample * 8;
        // probably not needed. :)
        component.y = y;
        component.quantization_table = qt_table;
        // initially stride contains its horizontal sub-sampling
        component.width_stride *= img.mcu_x * 8;
    }
    {
        // Sampling factors are one thing that suck
        // this fixes a specific problem with images like
        //
        // (2 2) None
        // (2 1) H
        // (2 1) H
        //
        // The images exist in the wild, the images are not meant to exist
        // but they do, it's just an annoying horizontal sub-sampling that
        // I don't know why it exists.
        // But it does
        // So we try to cope with that.
        // I am not sure of how to explain how to fix it, but it involved a debugger
        // and too much coke (the legal one)
        //
        // If this wasn't present, self.upsample_dest would have the wrong length
        let mut handle_that_annoying_bug = false;

        if let Some(y_component) = img
            .components
            .iter()
            .find(|c| c.component_id == ComponentID::Y)
        {
            if y_component.horizontal_sample == 2 || y_component.vertical_sample == 2 {
                handle_that_annoying_bug = true;
            }
        }
        if handle_that_annoying_bug {
            for comp in &mut img.components {
                if (comp.component_id != ComponentID::Y)
                    && (comp.horizontal_sample != 1 || comp.vertical_sample != 1)
                {
                    comp.fix_an_annoying_bug = 2;
                }
            }
        }
    }
    // MJPEG streams omit their Huffman tables; install the defaults.
    if img.is_mjpeg {
        fill_default_mjpeg_tables(
            img.is_progressive,
            &mut img.dc_huffman_tables,
            &mut img.ac_huffman_tables
        );
    }
    Ok(())
}
///Calculate number of fill bytes added to the end of a JPEG image
/// to fill the image
///
/// JPEG usually inserts padding bytes if the image width cannot be evenly divided into
/// 8 , 16 or 32 chunks depending on the sub sampling ratio. So given a sub-sampling ratio,
/// and the actual width, this calculates the padded bytes that were added to the image
///
/// # Params
/// -actual_width: Actual width of the image
/// -sub_sample: Sub sampling factor of the image
///
/// # Returns
/// The padded width, this is how long the width is for a particular image
pub fn calculate_padded_width(actual_width: usize, sub_sample: SampleRatios) -> usize {
    // None and V send a single MCU row, so rows are padded to 8 pixels;
    // H and HV send two, so rows may be padded out to a multiple of 16.
    let alignment = match sub_sample {
        SampleRatios::None | SampleRatios::V => 8,
        SampleRatios::H | SampleRatios::HV => 16
    };
    // round up to the next multiple of `alignment`
    ((actual_width + alignment - 1) / alignment) * alignment
}
// https://www.loc.gov/preservation/digital/formats/fdd/fdd000063.shtml
// "Avery Lee, writing in the rec.video.desktop newsgroup in 2001, commented that "MJPEG, or at
// least the MJPEG in AVIs having the MJPG fourcc, is restricted JPEG with a fixed -- and
// *omitted* -- Huffman table. The JPEG must be YCbCr colorspace, it must be 4:2:2, and it must
// use basic Huffman encoding, not arithmetic or progressive.... You can indeed extract the
// MJPEG frames and decode them with a regular JPEG decoder, but you have to prepend the DHT
// segment to them, or else the decoder won't have any idea how to decompress the data.
// The exact table necessary is given in the OpenDML spec.""
/// Install the default MJPEG Huffman tables (ITU-T T.81 Annex K, Section K.3.3,
/// as mandated by the OpenDML spec).
///
/// MJPEG frames omit the DHT segment entirely (see the comment above), so the
/// decoder must supply the standard DC/AC tables itself. Only slots that are
/// still `None` are filled; tables already parsed from the stream are kept.
///
/// # Params
/// - `is_progressive`: forwarded to `HuffmanTable::new_unfilled`
/// - `dc_huffman_tables` / `ac_huffman_tables`: table slots indexed by table id
pub fn fill_default_mjpeg_tables(
    is_progressive: bool, dc_huffman_tables: &mut [Option<HuffmanTable>],
    ac_huffman_tables: &mut [Option<HuffmanTable>]
) {
    // Section K.3.3
    trace!("Filling with default mjpeg tables");

    if dc_huffman_tables[0].is_none() {
        // Table K.3 — DC table 0
        dc_huffman_tables[0] = Some(
            HuffmanTable::new_unfilled(
                &[
                    0x00, 0x00, 0x01, 0x05, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
                    0x00, 0x00, 0x00, 0x00
                ],
                &[
                    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B
                ],
                true,
                is_progressive
            )
            // constant, spec-defined table data; construction should not fail
            .unwrap()
        );
    }
    if dc_huffman_tables[1].is_none() {
        // Table K.4 — DC table 1
        dc_huffman_tables[1] = Some(
            HuffmanTable::new_unfilled(
                &[
                    0x00, 0x00, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00,
                    0x00, 0x00, 0x00, 0x00
                ],
                &[
                    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B
                ],
                true,
                is_progressive
            )
            .unwrap()
        );
    }
    if ac_huffman_tables[0].is_none() {
        // Table K.5 — AC table 0
        ac_huffman_tables[0] = Some(
            HuffmanTable::new_unfilled(
                &[
                    0x00, 0x00, 0x02, 0x01, 0x03, 0x03, 0x02, 0x04, 0x03, 0x05, 0x05, 0x04, 0x04,
                    0x00, 0x00, 0x01, 0x7D
                ],
                &[
                    0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06, 0x13,
                    0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xA1, 0x08, 0x23, 0x42,
                    0xB1, 0xC1, 0x15, 0x52, 0xD1, 0xF0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0A,
                    0x16, 0x17, 0x18, 0x19, 0x1A, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x34, 0x35,
                    0x36, 0x37, 0x38, 0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A,
                    0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67,
                    0x68, 0x69, 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x83, 0x84,
                    0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
                    0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xB2, 0xB3,
                    0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
                    0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xE1,
                    0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xF1, 0xF2, 0xF3, 0xF4,
                    0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA
                ],
                false,
                is_progressive
            )
            .unwrap()
        );
    }
    if ac_huffman_tables[1].is_none() {
        // Table K.6 — AC table 1
        ac_huffman_tables[1] = Some(
            HuffmanTable::new_unfilled(
                &[
                    0x00, 0x00, 0x02, 0x01, 0x02, 0x04, 0x04, 0x03, 0x04, 0x07, 0x05, 0x04, 0x04,
                    0x00, 0x01, 0x02, 0x77
                ],
                &[
                    0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41, 0x51,
                    0x07, 0x61, 0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, 0xA1, 0xB1,
                    0xC1, 0x09, 0x23, 0x33, 0x52, 0xF0, 0x15, 0x62, 0x72, 0xD1, 0x0A, 0x16, 0x24,
                    0x34, 0xE1, 0x25, 0xF1, 0x17, 0x18, 0x19, 0x1A, 0x26, 0x27, 0x28, 0x29, 0x2A,
                    0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
                    0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x63, 0x64, 0x65, 0x66,
                    0x67, 0x68, 0x69, 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x82,
                    0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96,
                    0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA,
                    0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3, 0xC4, 0xC5,
                    0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9,
                    0xDA, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xF2, 0xF3, 0xF4,
                    0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA
                ],
                false,
                is_progressive
            )
            .unwrap()
        );
    }
}

4
third_party/zune-jpeg/src/unsafe_utils.rs vendored

@ -0,0 +1,4 @@
#[cfg(all(feature = "x86", any(target_arch = "x86", target_arch = "x86_64")))]
pub use crate::unsafe_utils_avx2::*;
#[cfg(all(feature = "neon", target_arch = "aarch64"))]
pub use crate::unsafe_utils_neon::*;

223
third_party/zune-jpeg/src/unsafe_utils_avx2.rs vendored

@ -0,0 +1,223 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
#![cfg(all(feature = "x86", any(target_arch = "x86", target_arch = "x86_64")))]
//! This module provides unsafe ways to do some things
#![allow(clippy::wildcard_imports)]
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use core::ops::{Add, AddAssign, Mul, MulAssign, Sub};
/// A copy of `_MM_SHUFFLE()` that doesn't require
/// a nightly compiler
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    // pack four 2-bit lane selectors into a single control byte, `w` lowest
    w | (x << 2) | (y << 4) | (z << 6)
}
/// An abstraction of an AVX ymm register that
/// allows some things to not look ugly
///
/// Operator impls below treat the register as eight packed `i32` lanes
/// (all arithmetic goes through the `_epi32` intrinsics).
#[derive(Clone, Copy)]
pub struct YmmRegister {
    /// An AVX register
    pub(crate) mm256: __m256i
}
impl Add for YmmRegister {
    type Output = YmmRegister;

    /// Lane-wise 32-bit addition of two registers.
    #[inline]
    fn add(self, rhs: Self) -> Self::Output {
        // SAFETY: `_mm256_add_epi32` requires AVX2; NOTE(review): callers are
        // expected to only use this type on AVX2-enabled paths — verify.
        let sum = unsafe { _mm256_add_epi32(self.mm256, rhs.mm256) };
        YmmRegister { mm256: sum }
    }
}

impl Add<i32> for YmmRegister {
    type Output = YmmRegister;

    /// Add the scalar `rhs` to every lane.
    #[inline]
    fn add(self, rhs: i32) -> Self::Output {
        unsafe {
            // broadcast the scalar to all eight lanes, then add
            let splat = _mm256_set1_epi32(rhs);
            let sum = _mm256_add_epi32(self.mm256, splat);
            YmmRegister { mm256: sum }
        }
    }
}

impl Sub for YmmRegister {
    type Output = YmmRegister;

    /// Lane-wise 32-bit subtraction of two registers.
    #[inline]
    fn sub(self, rhs: Self) -> Self::Output {
        let difference = unsafe { _mm256_sub_epi32(self.mm256, rhs.mm256) };
        YmmRegister { mm256: difference }
    }
}

impl AddAssign for YmmRegister {
    /// In-place lane-wise addition.
    #[inline]
    fn add_assign(&mut self, rhs: Self) {
        self.mm256 = unsafe { _mm256_add_epi32(self.mm256, rhs.mm256) };
    }
}

impl AddAssign<i32> for YmmRegister {
    /// In-place addition of a scalar to every lane.
    #[inline]
    fn add_assign(&mut self, rhs: i32) {
        unsafe {
            let splat = _mm256_set1_epi32(rhs);
            self.mm256 = _mm256_add_epi32(self.mm256, splat);
        }
    }
}
impl Mul for YmmRegister {
    type Output = YmmRegister;

    /// Lane-wise 32-bit multiply (low 32 bits of each product).
    #[inline]
    fn mul(self, rhs: Self) -> Self::Output {
        let product = unsafe { _mm256_mullo_epi32(self.mm256, rhs.mm256) };
        YmmRegister { mm256: product }
    }
}

impl Mul<i32> for YmmRegister {
    type Output = YmmRegister;

    /// Multiply every lane by the scalar `rhs`.
    #[inline]
    fn mul(self, rhs: i32) -> Self::Output {
        unsafe {
            // broadcast the scalar to all eight lanes, then multiply
            let splat = _mm256_set1_epi32(rhs);
            let product = _mm256_mullo_epi32(self.mm256, splat);
            YmmRegister { mm256: product }
        }
    }
}

impl MulAssign for YmmRegister {
    /// In-place lane-wise multiply.
    #[inline]
    fn mul_assign(&mut self, rhs: Self) {
        self.mm256 = unsafe { _mm256_mullo_epi32(self.mm256, rhs.mm256) };
    }
}

impl MulAssign<i32> for YmmRegister {
    /// In-place multiply of every lane by a scalar.
    #[inline]
    fn mul_assign(&mut self, rhs: i32) {
        unsafe {
            let splat = _mm256_set1_epi32(rhs);
            self.mm256 = _mm256_mullo_epi32(self.mm256, splat);
        }
    }
}

impl MulAssign<__m256i> for YmmRegister {
    /// In-place lane-wise multiply by a raw AVX register.
    #[inline]
    fn mul_assign(&mut self, rhs: __m256i) {
        self.mm256 = unsafe { _mm256_mullo_epi32(self.mm256, rhs) };
    }
}
/// Shorthand for the register type used by [`transpose`].
type Reg = YmmRegister;

/// Transpose an array of 8 by 8 i32's using avx intrinsics
///
/// This was translated from [here](https://newbedev.com/transpose-an-8x8-float-using-avx-avx2)
///
/// The transpose proceeds in three interleaving stages: 32-bit lanes,
/// then 64-bit pairs, then 128-bit halves.
///
/// # Safety
/// Requires the `avx2` target feature to be available at runtime.
#[allow(unused_parens, clippy::too_many_arguments)]
#[target_feature(enable = "avx2")]
#[inline]
pub unsafe fn transpose(
    v0: &mut Reg, v1: &mut Reg, v2: &mut Reg, v3: &mut Reg, v4: &mut Reg, v5: &mut Reg,
    v6: &mut Reg, v7: &mut Reg
) {
    // Stage-1 helper: interleave the 32-bit lanes of two (pre-permuted) rows.
    macro_rules! merge_epi32 {
        ($v0:tt,$v1:tt,$v2:tt,$v3:tt) => {
            let va = _mm256_permute4x64_epi64($v0, shuffle(3, 1, 2, 0));
            let vb = _mm256_permute4x64_epi64($v1, shuffle(3, 1, 2, 0));
            $v2 = _mm256_unpacklo_epi32(va, vb);
            $v3 = _mm256_unpackhi_epi32(va, vb);
        };
    }
    // Stage-2 helper: interleave 64-bit pairs of two rows.
    macro_rules! merge_epi64 {
        ($v0:tt,$v1:tt,$v2:tt,$v3:tt) => {
            let va = _mm256_permute4x64_epi64($v0, shuffle(3, 1, 2, 0));
            let vb = _mm256_permute4x64_epi64($v1, shuffle(3, 1, 2, 0));
            $v2 = _mm256_unpacklo_epi64(va, vb);
            $v3 = _mm256_unpackhi_epi64(va, vb);
        };
    }
    // Stage-3 helper: recombine the 128-bit halves of two rows.
    macro_rules! merge_si128 {
        ($v0:tt,$v1:tt,$v2:tt,$v3:tt) => {
            $v2 = _mm256_permute2x128_si256($v0, $v1, shuffle(0, 2, 0, 0));
            $v3 = _mm256_permute2x128_si256($v0, $v1, shuffle(0, 3, 0, 1));
        };
    }
    let (w0, w1, w2, w3, w4, w5, w6, w7);
    merge_epi32!((v0.mm256), (v1.mm256), w0, w1);
    merge_epi32!((v2.mm256), (v3.mm256), w2, w3);
    merge_epi32!((v4.mm256), (v5.mm256), w4, w5);
    merge_epi32!((v6.mm256), (v7.mm256), w6, w7);
    let (x0, x1, x2, x3, x4, x5, x6, x7);
    merge_epi64!(w0, w2, x0, x1);
    merge_epi64!(w1, w3, x2, x3);
    merge_epi64!(w4, w6, x4, x5);
    merge_epi64!(w5, w7, x6, x7);
    merge_si128!(x0, x4, (v0.mm256), (v1.mm256));
    merge_si128!(x1, x5, (v2.mm256), (v3.mm256));
    merge_si128!(x2, x6, (v4.mm256), (v5.mm256));
    merge_si128!(x3, x7, (v6.mm256), (v7.mm256));
}

331
third_party/zune-jpeg/src/unsafe_utils_neon.rs vendored

@ -0,0 +1,331 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
#![cfg(target_arch = "aarch64")]
// TODO can this be extended to armv7
//! This module provides unsafe ways to do some things
#![allow(clippy::wildcard_imports)]
use std::arch::aarch64::*;
use std::ops::{Add, AddAssign, BitOr, BitOrAssign, Mul, MulAssign, Sub};
/// Two NEON `int32x4_t` registers treated as a single 256-bit (8 x i32) vector.
pub type VecType = int32x4x2_t;

/// Load 8 consecutive `i32`s starting at `src` (no alignment requirement).
///
/// # Safety
/// `src` must be valid for reads of 8 `i32` values.
pub unsafe fn loadu(src: *const i32) -> VecType {
    vld1q_s32_x2(src as *const _)
}
/// An abstraction of a 256-bit vector register, emulated on NEON with a
/// pair of 128-bit `int32x4_t` registers.
///
/// Named `YmmRegister` for parity with the AVX2 implementation it mirrors.
#[derive(Clone, Copy)]
pub struct YmmRegister {
    /// The two 128-bit halves (field named `mm256` to match the x86 code)
    pub(crate) mm256: VecType
}
impl YmmRegister {
    /// Load 8 `i32`s from `src` into a register.
    ///
    /// # Safety
    /// `src` must be valid for reads of 8 `i32` values.
    #[inline]
    pub unsafe fn load(src: *const i32) -> Self {
        loadu(src).into()
    }

    /// Apply `f` to corresponding 128-bit halves of `self` and `other`
    /// and gather the two results back into a register.
    #[inline]
    pub fn map2(self, other: Self, f: impl Fn(int32x4_t, int32x4_t) -> int32x4_t) -> Self {
        let m0 = f(self.mm256.0, other.mm256.0);
        let m1 = f(self.mm256.1, other.mm256.1);
        YmmRegister {
            mm256: int32x4x2_t(m0, m1)
        }
    }

    /// Returns true if all 8 lanes are zero.
    #[inline]
    pub fn all_zero(self) -> bool {
        unsafe {
            // OR the halves together; the unsigned max across lanes is 0
            // only when every lane of both halves was 0.
            let both = vorrq_s32(self.mm256.0, self.mm256.1);
            let both_unsigned = vreinterpretq_u32_s32(both);
            0 == vmaxvq_u32(both_unsigned)
        }
    }

    /// Shift every lane left by the compile-time constant `N`.
    #[inline]
    pub fn const_shl<const N: i32>(self) -> Self {
        // Ensure that we logically shift left
        unsafe {
            let m0 = vreinterpretq_s32_u32(vshlq_n_u32::<N>(vreinterpretq_u32_s32(self.mm256.0)));
            let m1 = vreinterpretq_s32_u32(vshlq_n_u32::<N>(vreinterpretq_u32_s32(self.mm256.1)));
            YmmRegister {
                mm256: int32x4x2_t(m0, m1)
            }
        }
    }

    /// Arithmetic (sign-extending) right shift of every lane by the
    /// compile-time constant `N` (`vshrq_n_s32` operates on signed lanes).
    #[inline]
    pub fn const_shra<const N: i32>(self) -> Self {
        unsafe {
            let i0 = vshrq_n_s32::<N>(self.mm256.0);
            let i1 = vshrq_n_s32::<N>(self.mm256.1);
            YmmRegister {
                mm256: int32x4x2_t(i0, i1)
            }
        }
    }
}
// The operators below are generic over `T: Into<Self>`, so each accepts
// either another register or a plain `i32` (splatted across all lanes by
// the `From<i32>` impl). All of them delegate to `map2`, which applies the
// NEON intrinsic to both 128-bit halves.

impl<T> Add<T> for YmmRegister
where
    T: Into<Self>
{
    type Output = YmmRegister;

    /// Lane-wise 32-bit addition.
    #[inline]
    fn add(self, rhs: T) -> Self::Output {
        let rhs = rhs.into();
        unsafe { self.map2(rhs, |a, b| vaddq_s32(a, b)) }
    }
}

impl<T> Sub<T> for YmmRegister
where
    T: Into<Self>
{
    type Output = YmmRegister;

    /// Lane-wise 32-bit subtraction.
    #[inline]
    fn sub(self, rhs: T) -> Self::Output {
        let rhs = rhs.into();
        unsafe { self.map2(rhs, |a, b| vsubq_s32(a, b)) }
    }
}

impl<T> AddAssign<T> for YmmRegister
where
    T: Into<Self>
{
    /// In-place lane-wise addition.
    #[inline]
    fn add_assign(&mut self, rhs: T) {
        let rhs: Self = rhs.into();
        *self = *self + rhs;
    }
}

impl<T> Mul<T> for YmmRegister
where
    T: Into<Self>
{
    type Output = YmmRegister;

    /// Lane-wise 32-bit multiplication.
    #[inline]
    fn mul(self, rhs: T) -> Self::Output {
        let rhs = rhs.into();
        unsafe { self.map2(rhs, |a, b| vmulq_s32(a, b)) }
    }
}

impl<T> MulAssign<T> for YmmRegister
where
    T: Into<Self>
{
    /// In-place lane-wise multiplication.
    #[inline]
    fn mul_assign(&mut self, rhs: T) {
        let rhs: Self = rhs.into();
        *self = *self * rhs;
    }
}

impl<T> BitOr<T> for YmmRegister
where
    T: Into<Self>
{
    type Output = YmmRegister;

    /// Lane-wise bitwise OR.
    #[inline]
    fn bitor(self, rhs: T) -> Self::Output {
        let rhs = rhs.into();
        unsafe { self.map2(rhs, |a, b| vorrq_s32(a, b)) }
    }
}

impl<T> BitOrAssign<T> for YmmRegister
where
    T: Into<Self>
{
    /// In-place lane-wise bitwise OR.
    #[inline]
    fn bitor_assign(&mut self, rhs: T) {
        let rhs: Self = rhs.into();
        *self = *self | rhs;
    }
}
impl From<i32> for YmmRegister {
    /// Broadcast (splat) a scalar into all 8 lanes.
    #[inline]
    fn from(val: i32) -> Self {
        unsafe {
            let dup = vdupq_n_s32(val);
            YmmRegister {
                mm256: int32x4x2_t(dup, dup)
            }
        }
    }
}

impl From<VecType> for YmmRegister {
    /// Wrap a raw two-register NEON value without modification.
    #[inline]
    fn from(mm256: VecType) -> Self {
        YmmRegister { mm256 }
    }
}
/// Transpose, in place, a 4x4 block of `i32` held across the four rows
/// `v0..v3` (used by [`transpose`] to handle one quadrant of the 8x8 block).
///
/// First the 64-bit halves of row pairs are interleaved (`vtrn1q_s64` /
/// `vtrn2q_s64`), then the 32-bit lanes within them (`vtrnq_s32`).
#[allow(clippy::too_many_arguments)]
#[inline]
unsafe fn transpose4(
    v0: &mut int32x4_t, v1: &mut int32x4_t, v2: &mut int32x4_t, v3: &mut int32x4_t
) {
    // low 64-bit halves of (v0,v2) and (v1,v3), 32-bit transposed
    let w0 = vtrnq_s32(
        vreinterpretq_s32_s64(vtrn1q_s64(
            vreinterpretq_s64_s32(*v0),
            vreinterpretq_s64_s32(*v2)
        )),
        vreinterpretq_s32_s64(vtrn1q_s64(
            vreinterpretq_s64_s32(*v1),
            vreinterpretq_s64_s32(*v3)
        ))
    );
    // high 64-bit halves, 32-bit transposed
    let w1 = vtrnq_s32(
        vreinterpretq_s32_s64(vtrn2q_s64(
            vreinterpretq_s64_s32(*v0),
            vreinterpretq_s64_s32(*v2)
        )),
        vreinterpretq_s32_s64(vtrn2q_s64(
            vreinterpretq_s64_s32(*v1),
            vreinterpretq_s64_s32(*v3)
        ))
    );
    *v0 = w0.0;
    *v1 = w0.1;
    *v2 = w1.0;
    *v3 = w1.1;
}
/// Transpose an array of 8 by 8 i32
/// Arm has dedicated interleave/transpose instructions
/// we:
/// 1. Transpose the upper left and lower right quadrants
/// 2. Swap and transpose the upper right and lower left quadrants
///
/// # Safety
/// Uses NEON intrinsics; callers must only invoke this where NEON is
/// available (this module is `aarch64`-only).
#[allow(clippy::too_many_arguments)]
#[inline]
pub unsafe fn transpose(
    v0: &mut YmmRegister, v1: &mut YmmRegister, v2: &mut YmmRegister, v3: &mut YmmRegister,
    v4: &mut YmmRegister, v5: &mut YmmRegister, v6: &mut YmmRegister, v7: &mut YmmRegister
) {
    use std::mem::swap;
    // Name the four 4x4 quadrants: u=upper/l=lower, l=left/r=right.
    // Rows 0-3 live in v0..v3, rows 4-7 in v4..v7; the .0 half is the left
    // columns, the .1 half the right columns.
    let ul0 = &mut v0.mm256.0;
    let ul1 = &mut v1.mm256.0;
    let ul2 = &mut v2.mm256.0;
    let ul3 = &mut v3.mm256.0;

    let ur0 = &mut v0.mm256.1;
    let ur1 = &mut v1.mm256.1;
    let ur2 = &mut v2.mm256.1;
    let ur3 = &mut v3.mm256.1;

    let ll0 = &mut v4.mm256.0;
    let ll1 = &mut v5.mm256.0;
    let ll2 = &mut v6.mm256.0;
    let ll3 = &mut v7.mm256.0;

    let lr0 = &mut v4.mm256.1;
    let lr1 = &mut v5.mm256.1;
    let lr2 = &mut v6.mm256.1;
    let lr3 = &mut v7.mm256.1;

    // swap the off-diagonal quadrants, then transpose each quadrant in place
    swap(ur0, ll0);
    swap(ur1, ll1);
    swap(ur2, ll2);
    swap(ur3, ll3);

    transpose4(ul0, ul1, ul2, ul3);
    transpose4(ur0, ur1, ur2, ur3);
    transpose4(ll0, ll1, ll2, ll3);
    transpose4(lr0, lr1, lr2, lr3);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Check the SIMD transpose against a plain scalar transpose of the
    /// same 8x8 matrix.
    #[test]
    fn test_transpose() {
        // order-dependent value of (i, j) so a wrong lane permutation
        // cannot accidentally produce the right matrix
        fn get_val(i: usize, j: usize) -> i32 {
            ((i * 8) / (j + 1)) as i32
        }
        unsafe {
            let mut vals: [i32; 8 * 8] = [0; 8 * 8];

            for i in 0..8 {
                for j in 0..8 {
                    // some order-dependent value of i and j
                    let value = get_val(i, j);

                    vals[i * 8 + j] = value;
                }
            }
            // reinterpret the flat array as 8 SIMD rows
            let mut regs: [YmmRegister; 8] = std::mem::transmute(vals);
            let mut reg0 = regs[0];
            let mut reg1 = regs[1];
            let mut reg2 = regs[2];
            let mut reg3 = regs[3];
            let mut reg4 = regs[4];
            let mut reg5 = regs[5];
            let mut reg6 = regs[6];
            let mut reg7 = regs[7];

            transpose(
                &mut reg0, &mut reg1, &mut reg2, &mut reg3, &mut reg4, &mut reg5, &mut reg6,
                &mut reg7
            );
            regs[0] = reg0;
            regs[1] = reg1;
            regs[2] = reg2;
            regs[3] = reg3;
            regs[4] = reg4;
            regs[5] = reg5;
            regs[6] = reg6;
            regs[7] = reg7;

            let vals_from_reg: [i32; 8 * 8] = std::mem::transmute(regs);

            // scalar in-place transpose of the reference copy
            for i in 0..8 {
                for j in 0..i {
                    let orig = vals[i * 8 + j];
                    vals[i * 8 + j] = vals[j * 8 + i];
                    vals[j * 8 + i] = orig;
                }
            }
            for i in 0..8 {
                for j in 0..8 {
                    assert_eq!(vals[j * 8 + i], get_val(i, j));
                    assert_eq!(vals_from_reg[j * 8 + i], get_val(i, j));
                }
            }
            assert_eq!(vals, vals_from_reg);
        }
    }
}

101
third_party/zune-jpeg/src/upsampler.rs vendored

@ -0,0 +1,101 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
//! Up-sampling routines
//!
//! The main upsampling method is a bi-linear interpolation, also known as a
//! "triangle filter" or (in libjpeg-turbo) `fancy_upsampling`, which is a good
//! compromise between speed and visual quality
//!
//! # The filter
//! Each output pixel is made from `(3*A+B)/4` where A is the original
//! pixel closer to the output and B is the one further.
//!
//! ```text
//!+---+---+
//! | A | B |
//! +---+---+
//! +-+-+-+-+
//! | |P| | |
//! +-+-+-+-+
//! ```
//!
//! # Horizontal Bi-linear filter
//! ```text
//! |---+-----------+---+
//! | | | |
//! | A | |p1 | p2| | B |
//! | | | |
//! |---+-----------+---+
//!
//! ```
//! For a horizontal bi-linear it's trivial to implement,
//!
//! `A` becomes the input closest to the output.
//!
//! `B` varies depending on output.
//! - For odd positions, input is the `next` pixel after A
//! - For even positions, input is the `previous` value before A.
//!
//! We iterate in a classic 1-D sliding window with a window of 3.
//! For our sliding window approach, `A` is the 1st and `B` is either the 0th term or 2nd term
//! depending on position we are writing.(see scalar code).
//!
//! For vector code see module sse for explanation.
//!
//! # Vertical bi-linear.
//! Vertical up-sampling is a bit trickier.
//!
//! ```text
//! +----+----+
//! | A1 | A2 |
//! +----+----+
//! +----+----+
//! | p1 | p2 |
//! +----+-+--+
//! +----+-+--+
//! | p3 | p4 |
//! +----+-+--+
//! +----+----+
//! | B1 | B2 |
//! +----+----+
//! ```
//!
//! For `p1`
//! - `A1` is given a weight of `3` and `B1` is given a weight of 1.
//!
//! For `p3`
//! - `B1` is given a weight of `3` and `A1` is given a weight of 1
//!
//! # Horizontal vertical downsampling/chroma quartering.
//!
//! Carry out a vertical filter in the first pass, then a horizontal filter in the second pass.
use crate::components::UpSampler;
mod scalar;
// choose best possible implementation for this platform
/// Pick the horizontal upsampling routine for this platform.
///
/// Currently always the scalar implementation; `_use_unsafe` is reserved
/// for SIMD dispatch.
pub fn choose_horizontal_samp_function(_use_unsafe: bool) -> UpSampler {
    scalar::upsample_horizontal
}

/// Pick the combined horizontal + vertical upsampling routine for this platform.
pub fn choose_hv_samp_function(_use_unsafe: bool) -> UpSampler {
    scalar::upsample_hv
}

/// Pick the vertical upsampling routine for this platform.
pub fn choose_v_samp_function(_use_unsafe: bool) -> UpSampler {
    scalar::upsample_vertical
}
/// Upsample nothing
///
/// No-op used for components that require no upsampling; every argument is
/// ignored and `_output` is left untouched.
pub fn upsample_no_op(
    _input: &[i16], _in_ref: &[i16], _in_near: &[i16], _scratch_space: &mut [i16],
    _output: &mut [i16]
) {
}

110
third_party/zune-jpeg/src/upsampler/scalar.rs vendored

@ -0,0 +1,110 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
/// Horizontally upsample one scanline by 2x using a bi-linear
/// ("triangle") filter.
///
/// Each output pixel is `(3*A + B + 2) >> 2`, where `A` is the nearest
/// input sample and `B` the next nearest. The first and last output pairs
/// are handled separately since they have only one neighbour.
///
/// `_ref`, `_in_near` and `_scratch` are unused; they exist so the
/// signature matches the other upsampling routines.
///
/// # Panics
/// Panics unless `output.len() == 2 * input.len()`, `input.len() > 2`
/// and `output.len() > 4`.
pub fn upsample_horizontal(
    input: &[i16], _ref: &[i16], _in_near: &[i16], _scratch: &mut [i16], output: &mut [i16]
) {
    assert_eq!(
        input.len() * 2,
        output.len(),
        "Input length is not half the size of the output length"
    );
    assert!(
        output.len() > 4 && input.len() > 2,
        "Too Short of a vector, cannot upsample"
    );

    // Leading edge: first output copies the first sample; the second blends
    // it with its right-hand neighbour.
    output[0] = input[0];
    output[1] = (input[0] * 3 + input[1] + 2) >> 2;

    // Interior: slide a 3-wide window over the input, emitting two outputs
    // per centre sample. The centre gets weight 3, the outer neighbour 1,
    // plus 2 for rounding before the divide-by-4 shift.
    let pair_iter = output[2..].chunks_exact_mut(2).zip(input.windows(3));
    for (out_pair, window) in pair_iter {
        let weighted_centre = 3 * window[1] + 2;
        out_pair[0] = (weighted_centre + window[0]) >> 2;
        out_pair[1] = (weighted_centre + window[2]) >> 2;
    }

    // Trailing edge: mirror of the leading edge, written out manually.
    let tail_in = &input[input.len() - 2..];
    let out_len = output.len();
    let tail_out = &mut output[out_len - 2..];
    tail_out[0] = (3 * tail_in[0] + tail_in[1] + 2) >> 2;
    tail_out[1] = tail_in[1];
}
/// Vertically upsample one scanline into two output rows.
///
/// Each output pixel blends the centre sample (`input`, weight 3) with the
/// corresponding sample from the nearest neighbouring row (weight 1):
/// the first output row blends with `in_near`, the second with `in_far`.
///
/// # Panics
/// Panics unless `output.len() == 2 * input.len()` and both neighbour rows
/// have the same length as `input`.
pub fn upsample_vertical(
    input: &[i16], in_near: &[i16], in_far: &[i16], _scratch_space: &mut [i16], output: &mut [i16]
) {
    assert_eq!(input.len() * 2, output.len());
    assert_eq!(in_near.len(), input.len());
    assert_eq!(in_far.len(), input.len());

    let row_len = output.len() / 2;
    let (first_row, second_row) = output.split_at_mut(row_len);

    // first output row: `in_near` is the closest neighbouring row
    for ((centre, neighbour), out) in input.iter().zip(in_near.iter()).zip(first_row) {
        *out = (((3 * centre) + 2) + neighbour) >> 2;
    }
    // second output row: `in_far` is now the closest neighbouring row
    for ((centre, neighbour), out) in input.iter().zip(in_far.iter()).zip(second_row) {
        *out = (((3 * centre) + 2) + neighbour) >> 2;
    }
}
/// 2x2 upsample: a vertical pass into `scratch_space`, then a horizontal
/// pass over each produced scanline into `output`.
///
/// The horizontal pass runs once per scanline — doing both at once would
/// let edge pixels of one line bleed into the other.
///
/// # Panics
/// Panics unless `output.len() == 4 * input.len()` (and the inner passes'
/// own length requirements hold).
pub fn upsample_hv(
    input: &[i16], in_near: &[i16], in_far: &[i16], scratch_space: &mut [i16], output: &mut [i16]
) {
    assert_eq!(input.len() * 4, output.len());

    // the scratch parameter of the inner passes is unused, so a one-element
    // buffer suffices
    let mut unused = [0];
    // first double the height into the scratch buffer
    upsample_vertical(input, in_near, in_far, &mut unused, scratch_space);

    // then double the width of each of the two scanlines independently
    let scratch_mid = scratch_space.len() / 2;
    let output_mid = output.len() / 2;
    upsample_horizontal(
        &scratch_space[..scratch_mid],
        &[],
        &[],
        &mut unused,
        &mut output[..output_mid]
    );
    upsample_horizontal(
        &scratch_space[scratch_mid..],
        &[],
        &[],
        &mut unused,
        &mut output[output_mid..]
    );
}

429
third_party/zune-jpeg/src/worker.rs vendored

@ -0,0 +1,429 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
use alloc::format;
use core::convert::TryInto;
use zune_core::colorspace::ColorSpace;
use crate::color_convert::ycbcr_to_grayscale;
use crate::components::{Components, SampleRatios};
use crate::decoder::{ColorConvert16Ptr, MAX_COMPONENTS};
use crate::errors::DecodeErrors;
/// fast 0..255 * 0..255 => 0..255 rounded multiplication
///
/// Approximates `(in_val * y) / 255` with a rounding bias instead of a
/// division (Blinn's trick).
///
/// Borrowed from stb
#[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
#[inline]
fn blinn_8x8(in_val: u8, y: u8) -> u8 {
    // Widen to i32, add the rounding bias, then fold the high byte back in.
    let product = i32::from(in_val) * i32::from(y) + 128;
    ((product + (product >> 8)) >> 8) as u8
}
/// Dispatch colorspace conversion for one batch of decoded MCU rows.
///
/// Routes the component planes in `unprocessed` to the conversion routine
/// matching the `(input_colorspace, output_colorspace)` pair, writing pixels
/// into `output` and discarding the width padding samples. Returns an error
/// for colorspace pairs with no implemented conversion.
#[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
pub(crate) fn color_convert(
    unprocessed: &[&[i16]; MAX_COMPONENTS], color_convert_16: ColorConvert16Ptr,
    input_colorspace: ColorSpace, output_colorspace: ColorSpace, output: &mut [u8], width: usize,
    padded_width: usize
) -> Result<(), DecodeErrors> // so many parameters..
{
    // maximum sampling factors are in Y-channel, no need to pass them.
    //
    // Identity conversions (e.g. RGB -> RGB) only need the padding stripped.
    if input_colorspace == output_colorspace {
        match input_colorspace.num_components() {
            3 => {
                copy_removing_padding(unprocessed, width, padded_width, output);
                return Ok(());
            }
            4 => {
                copy_removing_padding_4x(unprocessed, width, padded_width, output);
                return Ok(());
            }
            // one-component identity (Luma -> Luma) is handled by the match below
            _ => {}
        }
    }
    // color convert
    match (input_colorspace, output_colorspace) {
        (ColorSpace::YCbCr | ColorSpace::Luma, ColorSpace::Luma) => {
            ycbcr_to_grayscale(unprocessed[0], width, padded_width, output);
        }
        (
            ColorSpace::YCbCr,
            ColorSpace::RGB | ColorSpace::RGBA | ColorSpace::BGR | ColorSpace::BGRA
        ) => {
            color_convert_ycbcr(
                unprocessed,
                width,
                padded_width,
                output_colorspace,
                color_convert_16,
                output
            );
        }
        (ColorSpace::YCCK, ColorSpace::RGB) => {
            color_convert_ycck_to_rgb::<3>(
                unprocessed,
                width,
                padded_width,
                output_colorspace,
                color_convert_16,
                output
            );
        }
        (ColorSpace::YCCK, ColorSpace::RGBA) => {
            color_convert_ycck_to_rgb::<4>(
                unprocessed,
                width,
                padded_width,
                output_colorspace,
                color_convert_16,
                output
            );
        }
        (ColorSpace::CMYK, ColorSpace::RGB) => {
            color_convert_cymk_to_rgb::<3>(unprocessed, width, padded_width, output);
        }
        (ColorSpace::CMYK, ColorSpace::RGBA) => {
            color_convert_cymk_to_rgb::<4>(unprocessed, width, padded_width, output);
        }
        // For the other components we do nothing(currently)
        _ => {
            let msg = format!(
                "Unimplemented colorspace mapping from {input_colorspace:?} to {output_colorspace:?}");
            return Err(DecodeErrors::Format(msg));
        }
    }
    Ok(())
}
/// Copy a block to output removing padding bytes from input
/// if necessary
///
/// Each component plane row carries `padded_width` samples but only `width`
/// of them are real pixels; walking the output in rows of `width * 3` bytes
/// drops the padded tail of every plane row.
#[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
fn copy_removing_padding(
    mcu_block: &[&[i16]; MAX_COMPONENTS], width: usize, padded_width: usize, output: &mut [u8]
) {
    let rows = output
        .chunks_exact_mut(width * 3)
        .zip(mcu_block[0].chunks_exact(padded_width))
        .zip(mcu_block[1].chunks_exact(padded_width))
        .zip(mcu_block[2].chunks_exact(padded_width));
    for (((out_row, ch0_row), ch1_row), ch2_row) in rows {
        // Interleave the three planes pixel by pixel, truncating i16 -> u8.
        let pixels = out_row
            .chunks_exact_mut(3)
            .zip(ch0_row)
            .zip(ch1_row)
            .zip(ch2_row);
        for (((pix, ch0), ch1), ch2) in pixels {
            pix[0] = *ch0 as u8;
            pix[1] = *ch1 as u8;
            pix[2] = *ch2 as u8;
        }
    }
}
/// Copy a four-component block to output, removing padding samples from the
/// input planes.
///
/// Same as `copy_removing_padding` but with a fourth plane and a pixel
/// stride of 4 bytes.
#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
fn copy_removing_padding_4x(
    mcu_block: &[&[i16]; MAX_COMPONENTS], width: usize, padded_width: usize, output: &mut [u8]
) {
    let rows = output
        .chunks_exact_mut(width * 4)
        .zip(mcu_block[0].chunks_exact(padded_width))
        .zip(mcu_block[1].chunks_exact(padded_width))
        .zip(mcu_block[2].chunks_exact(padded_width))
        .zip(mcu_block[3].chunks_exact(padded_width));
    for ((((out_row, ch0_row), ch1_row), ch2_row), ch3_row) in rows {
        // Interleave the four planes pixel by pixel, truncating i16 -> u8.
        let pixels = out_row
            .chunks_exact_mut(4)
            .zip(ch0_row)
            .zip(ch1_row)
            .zip(ch2_row)
            .zip(ch3_row);
        for ((((pix, ch0), ch1), ch2), ch3) in pixels {
            pix[0] = *ch0 as u8;
            pix[1] = *ch1 as u8;
            pix[2] = *ch2 as u8;
            pix[3] = *ch3 as u8;
        }
    }
}
/// Convert YCCK image to rgb
///
/// YCCK is YCbCr plus an inverted K channel: first run the normal
/// YCbCr -> RGB conversion, then blend each pixel's color channels with the
/// K plane via `blinn_8x8` after inverting them.
///
/// `NUM_COMPONENTS` is the output pixel stride (3 for RGB, 4 for RGBA).
#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
fn color_convert_ycck_to_rgb<const NUM_COMPONENTS: usize>(
    mcu_block: &[&[i16]; MAX_COMPONENTS], width: usize, padded_width: usize,
    output_colorspace: ColorSpace, color_convert_16: ColorConvert16Ptr, output: &mut [u8]
) {
    // First pass: treat the Y/Cb/Cr planes like an ordinary YCbCr image.
    color_convert_ycbcr(
        mcu_block,
        width,
        padded_width,
        output_colorspace,
        color_convert_16,
        output
    );
    // Second pass: fold in the K plane row by row.
    //
    // An output row is `width * NUM_COMPONENTS` bytes. (This previously
    // hard-coded `width * 3`, which desynchronized output rows from the K
    // plane rows whenever NUM_COMPONENTS == 4, i.e. RGBA output.)
    for (pix_w, k_row) in output
        .chunks_exact_mut(width * NUM_COMPONENTS)
        .zip(mcu_block[3].chunks_exact(padded_width))
    {
        for (pix, m) in pix_w.chunks_exact_mut(NUM_COMPONENTS).zip(k_row) {
            let m = (*m) as u8;
            // Only the three color channels are blended; a fourth (alpha)
            // channel, when present, is left untouched.
            pix[0] = blinn_8x8(255 - pix[0], m);
            pix[1] = blinn_8x8(255 - pix[1], m);
            pix[2] = blinn_8x8(255 - pix[2], m);
        }
    }
}
/// Convert a CMYK image to RGB.
///
/// Applies the Adobe inverted-CMYK convention used in JPEG: each color
/// channel is multiplied by the K channel via `blinn_8x8`.
///
/// `NUM_COMPONENTS` is the output pixel stride (3 for RGB, 4 for RGBA).
#[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
fn color_convert_cymk_to_rgb<const NUM_COMPONENTS: usize>(
    mcu_block: &[&[i16]; MAX_COMPONENTS], width: usize, padded_width: usize, output: &mut [u8]
) {
    for ((((pix_w, c_w), m_w), y_w), k_w) in output
        .chunks_exact_mut(width * NUM_COMPONENTS)
        .zip(mcu_block[0].chunks_exact(padded_width))
        .zip(mcu_block[1].chunks_exact(padded_width))
        .zip(mcu_block[2].chunks_exact(padded_width))
        .zip(mcu_block[3].chunks_exact(padded_width))
    {
        // Pixels are NUM_COMPONENTS bytes wide. (This previously chunked by
        // a hard-coded 3, which misaligned the RGB triplets across 4-byte
        // pixels whenever NUM_COMPONENTS == 4, i.e. RGBA output.)
        for ((((pix, c), m), y), k) in pix_w
            .chunks_exact_mut(NUM_COMPONENTS)
            .zip(c_w)
            .zip(m_w)
            .zip(y_w)
            .zip(k_w)
        {
            let c = *c as u8;
            let m = *m as u8;
            let y = *y as u8;
            let k = *k as u8;
            pix[0] = blinn_8x8(c, k);
            pix[1] = blinn_8x8(m, k);
            pix[2] = blinn_8x8(y, k);
            // NOTE(review): for NUM_COMPONENTS == 4 the alpha byte is left
            // as-is — confirm callers pre-fill the output buffer with the
            // intended alpha value.
        }
    }
}
/// Do color-conversion for interleaved MCU
///
/// Converts the Y/Cb/Cr planes in `mcu_block[0..3]` into `output` pixels of
/// `output_colorspace`, one image row at a time. Each plane row is
/// `padded_width` samples long but only `width` samples are real pixels.
/// The conversion function `color_convert_16` works on 16 pixels per call,
/// so widths that are not a multiple of 16 are finished by re-converting the
/// last 16 pixels of the row (some pixels are converted twice); widths below
/// 16 go through a zero-padded temporary buffer instead.
#[allow(
    clippy::similar_names,
    clippy::too_many_arguments,
    clippy::needless_pass_by_value,
    clippy::unwrap_used
)]
fn color_convert_ycbcr(
    mcu_block: &[&[i16]; MAX_COMPONENTS], width: usize, padded_width: usize,
    output_colorspace: ColorSpace, color_convert_16: ColorConvert16Ptr, output: &mut [u8]
) {
    let num_components = output_colorspace.num_components();
    // Bytes per real (unpadded) output row.
    let stride = width * num_components;
    // Allocate temporary buffer for small widths less than 16.
    // NOTE(review): 64 bytes assumes at most 4 output components per pixel
    // (16 * 4) — confirm no output colorspace here exceeds 4 components.
    let mut temp = [0; 64];
    // We need to chunk per width to ensure we can discard extra values at the end of the width.
    // Since the encoder may pad bits to ensure the width is a multiple of 8.
    for (((y_width, cb_width), cr_width), out) in mcu_block[0]
        .chunks_exact(padded_width)
        .zip(mcu_block[1].chunks_exact(padded_width))
        .zip(mcu_block[2].chunks_exact(padded_width))
        .zip(output.chunks_exact_mut(stride))
    {
        if width < 16 {
            // allocate temporary buffers for the values received from idct
            let mut y_out = [0; 16];
            let mut cb_out = [0; 16];
            let mut cr_out = [0; 16];
            // copy those small widths to that buffer
            y_out[0..y_width.len()].copy_from_slice(y_width);
            cb_out[0..cb_width.len()].copy_from_slice(cb_width);
            cr_out[0..cr_width.len()].copy_from_slice(cr_width);
            // we handle widths less than 16 a bit differently, allocating a temporary
            // buffer and writing to that and then flushing to the out buffer
            // because of the optimizations applied below,
            (color_convert_16)(&y_out, &cb_out, &cr_out, &mut temp, &mut 0);
            // copy to stride
            out[0..width * num_components].copy_from_slice(&temp[0..width * num_components]);
            // next
            continue;
        }
        // Chunk in outputs of 16 to pass to color_convert as an array of 16 i16's.
        for (((y, cb), cr), out_c) in y_width
            .chunks_exact(16)
            .zip(cb_width.chunks_exact(16))
            .zip(cr_width.chunks_exact(16))
            .zip(out.chunks_exact_mut(16 * num_components))
        {
            // try_into().unwrap() converts the &[i16] chunk to &[i16; 16];
            // chunks_exact guarantees the length, so this cannot fail.
            (color_convert_16)(
                y.try_into().unwrap(),
                cb.try_into().unwrap(),
                cr.try_into().unwrap(),
                out_c,
                &mut 0
            );
        }
        //we have more pixels in the end that can't be handled by the main loop.
        //move pointer back a little bit to get last 16 bytes,
        //color convert, and overwrite
        //This means some values will be color converted twice.
        for ((y, cb), cr) in y_width[width - 16..]
            .chunks_exact(16)
            .zip(cb_width[width - 16..].chunks_exact(16))
            .zip(cr_width[width - 16..].chunks_exact(16))
            .take(1)
        {
            (color_convert_16)(
                y.try_into().unwrap(),
                cb.try_into().unwrap(),
                cr.try_into().unwrap(),
                &mut temp,
                &mut 0
            );
        }
        // Flush the re-converted tail from `temp` over the last 16 pixels of
        // this output row.
        let rem = out[(width - 16) * num_components..]
            .chunks_exact_mut(16 * num_components)
            .next()
            .unwrap();
        rem.copy_from_slice(&temp[0..rem.len()]);
    }
}
/// Upsample the coefficients of `component` for the current MCU row.
///
/// `i` is the index of the MCU row being processed and `mcu_height` the
/// total number of MCU rows; both are needed to special-case the image's
/// first and last rows when vertical sampling requires neighbouring rows.
/// `upsampler_scratch_space` is passed through to the component's
/// upsampling function. Vertical (`V`/`HV`) sampling carries rows over
/// between calls via `component.row` / `component.row_up`; see the block
/// comment below.
pub(crate) fn upsample(
    component: &mut Components, mcu_height: usize, i: usize, upsampler_scratch_space: &mut [i16]
) {
    match component.sample_ratio {
        SampleRatios::V | SampleRatios::HV => {
            /*
            When upsampling vertically sampled images, we have a certain problem
            which is that we do not have all MCU's decoded, this usually sucks at boundaries
            e.g we can't upsample the last mcu row, since the row_down currently doesn't exist
            To solve this we need to do two things
            1. Carry over coefficients when we lack enough data to upsample
            2. Upsample when we have enough data
            To achieve (1), we store a previous row, and the current row in components themselves
            which will later be used to make (2)
            To achieve (2), we take the stored previous row(second last MCU row),
            current row(last mcu row) and row down(first row of newly decoded MCU)
            and upsample that and store it in first_row_upsample_dest, this contains
            up-sampled coefficients for the last for the previous decoded mcu row.
            The caller is then expected to process first_row_upsample_dest before processing data
            in component.upsample_dest which stores the up-sampled components excluding the last row
            */
            // Running write offset into component.upsample_dest.
            let mut dest_start = 0;
            let stride_bytes_written = component.width_stride * component.sample_ratio.sample();
            if i > 0 {
                // Handle the last MCU of the previous row
                // This wasn't up-sampled as we didn't have the row_down
                // so we do it now
                let stride = component.width_stride;
                let dest = &mut component.first_row_upsample_dest[0..stride_bytes_written];
                // get current row
                let row = &component.row[..];
                let row_up = &component.row_up[..];
                // first row of the newly decoded MCU acts as the row below
                let row_down = &component.raw_coeff[0..stride];
                (component.up_sampler)(row, row_up, row_down, upsampler_scratch_space, dest);
            }
            // we have the Y component width stride.
            // this may be higher than the actual width,(2x because vertical sampling)
            //
            // This will not upsample the last row
            // if false, do not upsample.
            // set to false on the last row of an mcu
            let mut upsample = true;
            // NOTE(review): rows below are chunked by `width_stride` but
            // sliced by `stride` (= width_stride * vertical_sample); these
            // only agree when `vertical_sample` is 1 — confirm V/HV
            // components always reach here with vertical_sample == 1.
            let stride = component.width_stride * component.vertical_sample;
            // number of rows available in raw_coeff
            let stop_offset = component.raw_coeff.len() / component.width_stride;
            for (pos, curr_row) in component
                .raw_coeff
                .chunks_exact(component.width_stride)
                .enumerate()
            {
                let mut dest: &mut [i16] = &mut [];
                let mut row_up: &[i16] = &[];
                // row below current sample
                let mut row_down: &[i16] = &[];
                // Order of ifs matters
                if i == 0 && pos == 0 {
                    // first IMAGE row, row_up is the same as current row
                    // row_down is the row below.
                    row_up = &component.raw_coeff[pos * stride..(pos + 1) * stride];
                    row_down = &component.raw_coeff[(pos + 1) * stride..(pos + 2) * stride];
                } else if i > 0 && pos == 0 {
                    // first row of a new mcu, previous row was copied so use that
                    row_up = &component.row[..];
                    row_down = &component.raw_coeff[(pos + 1) * stride..(pos + 2) * stride];
                } else if i == mcu_height.saturating_sub(1) && pos == stop_offset - 1 {
                    // last IMAGE row, adjust pointer to use previous row and current row
                    row_up = &component.raw_coeff[(pos - 1) * stride..pos * stride];
                    row_down = &component.raw_coeff[pos * stride..(pos + 1) * stride];
                } else if pos > 0 && pos < stop_offset - 1 {
                    // other rows, get row up and row down relative to our current row
                    // ignore last row of each mcu
                    row_up = &component.raw_coeff[(pos - 1) * stride..pos * stride];
                    row_down = &component.raw_coeff[(pos + 1) * stride..(pos + 2) * stride];
                } else if pos == stop_offset - 1 {
                    // last MCU in a row
                    //
                    // we need a row at the next MCU but we haven't decoded that MCU yet
                    // so we should save this and when we have the next MCU,
                    // do the upsampling
                    // store the current row and previous row in a buffer
                    let prev_row = &component.raw_coeff[(pos - 1) * stride..pos * stride];
                    component.row_up.copy_from_slice(prev_row);
                    component.row.copy_from_slice(curr_row);
                    upsample = false;
                } else {
                    unreachable!("Uh oh!");
                }
                if upsample {
                    // carve out the next output slot; only advances when we
                    // actually upsample this row
                    dest =
                        &mut component.upsample_dest[dest_start..dest_start + stride_bytes_written];
                    dest_start += stride_bytes_written;
                }
                if upsample {
                    // upsample
                    (component.up_sampler)(
                        curr_row,
                        row_up,
                        row_down,
                        upsampler_scratch_space,
                        dest
                    );
                }
            }
        }
        SampleRatios::H => {
            // Horizontal-only: every row doubles in width, no neighbouring
            // rows needed, so each row can be processed independently.
            assert_eq!(component.raw_coeff.len() * 2, component.upsample_dest.len());
            let raw_coeff = &component.raw_coeff;
            let dest_coeff = &mut component.upsample_dest;
            // upsample each row
            for (single_row, output_stride) in raw_coeff
                .chunks_exact(component.width_stride)
                .zip(dest_coeff.chunks_exact_mut(component.width_stride * 2))
            {
                // upsample using the fn pointer, should only be H, so no need for
                // row up and row down
                (component.up_sampler)(single_row, &[], &[], &mut [], output_stride);
            }
        }
        // No upsampling required for this component.
        SampleRatios::None => {}
    };
}

66
third_party/zune-jpeg/tests/invalid_images.rs vendored

@ -0,0 +1,66 @@
/*
* Copyright (c) 2023.
*
* This software is free software;
*
* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
*/
use zune_core::bytestream::ZCursor;
use zune_jpeg::JpegDecoder;
#[test]
fn eof() {
    // A truncated stream (SOI marker then a stray byte) must fail to decode.
    let mut dec = JpegDecoder::new(ZCursor::new([0xff, 0xd8, 0xa4]));
    assert!(dec.decode().is_err());
}
#[test]
fn bad_ff_marker_size() {
    // Malformed 0xFF marker sequence: decoding must return an error, not panic.
    let bytes = [0xff, 0xd8, 0xff, 0x00, 0x00, 0x00];
    let mut dec = JpegDecoder::new(ZCursor::new(bytes));
    assert!(dec.decode().is_err());
}
#[test]
fn bad_number_of_scans() {
    // SOS segment with a bogus length field: expect the exact SOS error.
    let mut dec = JpegDecoder::new(ZCursor::new([255, 216, 255, 218, 232, 197, 255]));
    let err = dec.decode().unwrap_err();
    let matched =
        matches!(err, zune_jpeg::errors::DecodeErrors::SosError(x) if x == "Bad SOS length 59589,corrupt jpeg");
    assert!(matched);
}
#[test]
fn huffman_length_subtraction_overflow() {
    // Truncated Huffman table segment: the length underflow must surface as
    // a clean error.
    let mut dec = JpegDecoder::new(ZCursor::new([255, 216, 255, 196, 0, 0]));
    let err = dec.decode().unwrap_err();
    let matched =
        matches!(err, zune_jpeg::errors::DecodeErrors::FormatStatic(x) if x == "Invalid Huffman length in image");
    assert!(matched);
}
#[test]
fn index_oob() {
    // Malformed scan header: must error out instead of indexing out of bounds.
    let bytes = [255, 216, 255, 218, 0, 8, 1, 0, 8, 1];
    let mut dec = JpegDecoder::new(ZCursor::new(bytes));
    assert!(dec.decode().is_err());
}
#[test]
fn mul_with_overflow() {
    // Start-of-frame segment with an absurd length field: expect the exact
    // SOF error message.
    let bytes = [255, 216, 255, 192, 255, 1, 8, 9, 119, 48, 255, 192];
    let mut dec = JpegDecoder::new(ZCursor::new(bytes));
    let err = dec.decode().unwrap_err();
    let matched =
        matches!(err, zune_jpeg::errors::DecodeErrors::SofError(x) if x == "Length of start of frame differs from expected 584,value is 65281");
    assert!(matched);
}
Loading…
Cancel
Save