jpeg: Android and CMake build support

From 9dd8272228382d343120ca0dd17353c45b24ea76 Mon Sep 17 00:00:00 2001
From: Sam Lantinga <[EMAIL REDACTED]>
Date: Sat, 7 May 2022 10:28:56 -0700
Subject: [PATCH] Android and CMake build support

---
 Android.mk     |  37 ++++
 CMakeLists.txt |  29 +++
 jconfig.h      | 151 ++++++++++++++++
 jidctfst.S     | 476 +++++++++++++++++++++++++++++++++++++++++++++++++
 jmem-android.c | 187 +++++++++++++++++++
 5 files changed, 880 insertions(+)
 create mode 100644 Android.mk
 create mode 100644 CMakeLists.txt
 create mode 100644 jconfig.h
 create mode 100644 jidctfst.S
 create mode 100644 jmem-android.c

diff --git a/Android.mk b/Android.mk
new file mode 100644
index 0000000..3ae33d2
--- /dev/null
+++ b/Android.mk
@@ -0,0 +1,37 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_ARM_MODE := arm
+
+LOCAL_SRC_FILES := \
+    jaricom.c jcapimin.c jcapistd.c jcarith.c jccoefct.c jccolor.c \
+    jcdctmgr.c jchuff.c jcinit.c jcmainct.c jcmarker.c jcmaster.c \
+    jcomapi.c jcparam.c jcprepct.c jcsample.c jctrans.c jdapimin.c \
+    jdapistd.c jdarith.c jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c \
+    jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c jdmaster.c \
+    jdmerge.c jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c \
+    jfdctfst.c jfdctint.c jidctflt.c jquant1.c \
+    jquant2.c jutils.c jmemmgr.c \
+	jmem-android.c
+
+# the assembler IDCT (jidctfst.S) is only for the ARM version, don't break the Linux sim
+ifneq ($(TARGET_ARCH),arm)
+ANDROID_JPEG_NO_ASSEMBLER := true
+endif
+
+# temp fix until we understand why this broke cnn.com: set unconditionally, so the arch check above has no effect and jidctfst.c is always built
+ANDROID_JPEG_NO_ASSEMBLER := true
+
+ifeq ($(strip $(ANDROID_JPEG_NO_ASSEMBLER)),true)
+LOCAL_SRC_FILES += jidctint.c jidctfst.c
+else
+LOCAL_SRC_FILES += jidctint.c jidctfst.S
+endif
+
+LOCAL_CFLAGS += -DAVOID_TABLES
+LOCAL_CFLAGS += -O3 -fstrict-aliasing -fprefetch-loop-arrays
+#LOCAL_CFLAGS += -march=armv6j
+
+LOCAL_MODULE:= jpeg
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..3b49a85
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.1)
+project(jpeg C)
+
+# Sources go straight into add_library(): calling it without sources needs CMake >= 3.11.
+add_library(jpeg STATIC
+		jaricom.c jcapimin.c jcapistd.c jcarith.c jccoefct.c jccolor.c
+		jcdctmgr.c jchuff.c jcinit.c jcmainct.c jcmarker.c jcmaster.c
+		jcomapi.c jcparam.c jcprepct.c jcsample.c jctrans.c jdapimin.c
+		jdapistd.c jdarith.c jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c
+		jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c jdmaster.c
+		jdmerge.c jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c
+		jfdctfst.c jfdctint.c jidctflt.c jquant1.c
+		jquant2.c jutils.c jmemmgr.c)
+
+# Memory manager back end: jmem-android.c on Android, jmemansi.c everywhere else.
+if (ANDROID)
+	target_sources(jpeg PRIVATE jmem-android.c)
+else()
+	target_sources(jpeg PRIVATE jmemansi.c)
+endif()
+
+target_sources(jpeg PRIVATE jidctint.c jidctfst.c)
+
+# FIXME : include asm for ARM
+# target_sources(jpeg PRIVATE jidctint.c jidctfst.S)
+
+target_compile_definitions(jpeg PRIVATE AVOID_TABLES)
+
+target_include_directories(jpeg PUBLIC .)
diff --git a/jconfig.h b/jconfig.h
new file mode 100644
index 0000000..2d3c68e
--- /dev/null
+++ b/jconfig.h
@@ -0,0 +1,151 @@
+/* android jconfig.h */
+
+/*
+ * Copyright (C) 1991-1994, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file documents the configuration options that are required to
+ * customize the JPEG software for a particular system.
+ *
+ * The actual configuration options for a particular installation are stored
+ * in jconfig.h.  On many machines, jconfig.h can be generated automatically
+ * or copied from one of the "canned" jconfig files that we supply.  But if
+ * you need to generate a jconfig.h file by hand, this file tells you how.
+ */
+
+
+/*
+ * These symbols indicate the properties of your machine or compiler.
+ * #define the symbol if yes, #undef it if no.
+ */
+
+/* Does your compiler support function prototypes?
+ * (If not, you also need to use ansi2knr, see install.doc)
+ */
+#define HAVE_PROTOTYPES
+
+/* Does your compiler support the declaration "unsigned char" ?
+ * How about "unsigned short" ?
+ */
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
+
+/* Define "void" as "char" if your compiler doesn't know about type void.
+ * NOTE: be sure to define void such that "void *" represents the most general
+ * pointer type, e.g., that returned by malloc().
+ */
+/* #define void char */
+
+/* Define "const" as empty if your compiler doesn't know the "const" keyword.
+ */
+/* #define const */
+
+/* Define this if an ordinary "char" type is unsigned.
+ * If you're not sure, leaving it undefined will work at some cost in speed.
+ * If you defined HAVE_UNSIGNED_CHAR then the speed difference is minimal.
+ */
+#undef CHAR_IS_UNSIGNED
+
+/* Define this if your system has an ANSI-conforming <stddef.h> file.
+ */
+#define HAVE_STDDEF_H 1
+
+/* Define this if your system has an ANSI-conforming <stdlib.h> file.
+ */
+#define HAVE_STDLIB_H 1
+
+/* Define this if your system does not have an ANSI/SysV <string.h>,
+ * but does have a BSD-style <strings.h>.
+ */
+#undef NEED_BSD_STRINGS
+
+/* Define this if your system does not provide typedef size_t in any of the
+ * ANSI-standard places (stddef.h, stdlib.h, or stdio.h), but places it in
+ * <sys/types.h> instead.
+ */
+#undef NEED_SYS_TYPES_H
+
+/* For 80x86 machines, you need to define NEED_FAR_POINTERS,
+ * unless you are using a large-data memory model or 80386 flat-memory mode.
+ * On less brain-damaged CPUs this symbol must not be defined.
+ * (Defining this symbol causes large data structures to be referenced through
+ * "far" pointers and to be allocated with a special version of malloc.)
+ */
+#undef NEED_FAR_POINTERS
+
+/* Define this if your linker needs global names to be unique in less
+ * than the first 15 characters.
+ */
+#undef NEED_SHORT_EXTERNAL_NAMES
+
+/* Although a real ANSI C compiler can deal perfectly well with pointers to
+ * unspecified structures (see "incomplete types" in the spec), a few pre-ANSI
+ * and pseudo-ANSI compilers get confused.  To keep one of these bozos happy,
+ * define INCOMPLETE_TYPES_BROKEN.  This is not recommended unless you
+ * actually get "missing structure definition" warnings or errors while
+ * compiling the JPEG code.
+ */
+#undef INCOMPLETE_TYPES_BROKEN
+
+
+/*
+ * The following options affect code selection within the JPEG library,
+ * but they don't need to be visible to applications using the library.
+ * To minimize application namespace pollution, the symbols won't be
+ * defined unless JPEG_INTERNALS has been defined.
+ */
+
+#ifdef JPEG_INTERNALS
+
+/* Define this if your compiler implements ">>" on signed values as a logical
+ * (unsigned) shift; leave it undefined if ">>" is a signed (arithmetic) shift,
+ * which is the normal and rational definition.
+ */
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+#endif /* JPEG_INTERNALS */
+
+
+/*
+ * The remaining options do not affect the JPEG library proper,
+ * but only the sample applications cjpeg/djpeg (see cjpeg.c, djpeg.c).
+ * Other applications can ignore these.
+ */
+
+#ifdef JPEG_CJPEG_DJPEG
+
+/* These defines indicate which image (non-JPEG) file formats are allowed. */
+
+#define BMP_SUPPORTED		/* BMP image file format */
+#define GIF_SUPPORTED		/* GIF image file format */
+#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
+#undef RLE_SUPPORTED		/* Utah RLE image file format */
+#define TARGA_SUPPORTED		/* Targa image file format */
+
+/* Define this if you want to name both input and output files on the command
+ * line, rather than using stdout and optionally stdin.  You MUST do this if
+ * your system can't cope with binary I/O to stdin/stdout.  See comments at
+ * head of cjpeg.c or djpeg.c.
+ */
+#undef TWO_FILE_COMMANDLINE
+
+/* Define this if your system needs explicit cleanup of temporary files.
+ * This is crucial under MS-DOS, where the temporary "files" may be areas
+ * of extended memory; on most other systems it's not as important.
+ */
+#undef NEED_SIGNAL_CATCHER
+
+/* By default, we open image files with fopen(...,"rb") or fopen(...,"wb").
+ * This is necessary on systems that distinguish text files from binary files,
+ * and is harmless on most systems that don't.  If you have one of the rare
+ * systems that complains about the "b" spec, define this symbol.
+ */
+#undef DONT_USE_B_MODE
+
+/* Define this if you want percent-done progress reports from cjpeg/djpeg.
+ */
+#undef PROGRESS_REPORT
+
+
+#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jidctfst.S b/jidctfst.S
new file mode 100644
index 0000000..b35f938
--- /dev/null
+++ b/jidctfst.S
@@ -0,0 +1,476 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <machine/cpu-features.h>
+
+    .text
+    .align
+
+    .global jpeg_idct_ifast
+    .func   jpeg_idct_ifast
+
+// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15
+
+// jpeg_idct_ifast (j_decompress_ptr       cinfo,
+//                 jpeg_component_info *   compptr,
+//                 short*                  coef_block,
+//                 unsigned char*          output_buf,
+//                 int                     output_col)
+
+#define  local_TMP0123       sp                      /* base of the four TMP slots (stmia/ldmia target) */
+#define  local_TMP0          [sp, #0]
+#define  local_TMP1          [sp, #4]
+#define  local_TMP2          [sp, #8]
+#define  local_TMP3          [sp, #12]
+#define  local_RANGE_TABLE   [sp, #16]
+#define  local_OUTPUT_COL    [sp, #20]
+#define  local_OUTPUT_BUF    [sp, #24]
+#define  local_UNUSED        [sp, #28]
+#define  off_WORKSPACE       32                      /* byte offset of the workspace within the frame */
+#define  local_WORKSPACE     [sp, #off_WORKSPACE]    /* was "offWORKSPACE", an undefined name */
+#define  local_SIZE          (off_WORKSPACE + 8*8*4) /* locals plus an 8x8 workspace of 4-byte words */
+
+#define  off_DECOMPRESS_range_limit_base  324        /* byte offset read from cinfo; NOTE(review): hard-coded, confirm against jpeglib.h in this tree */
+#define  off_COMPINFO_quanttable          80         /* byte offset read from compptr; NOTE(review): hard-coded, confirm against jpeglib.h in this tree */
+
+#define  DCTSIZE   8
+#define  VY(x)   ((x)*DCTSIZE*2)                     /* byte offset of row x in the 2-byte coefficient block */
+#define  QY(x)   ((x)*DCTSIZE*4)                     /* byte offset of row x in a table of 4-byte words */
+
+#define  VX(x)   ((x)*2)
+#define  QX(x)   ((x)*4)
+
+#define  FIX_1_414213562    #362                     /* the code below builds 362 as 360 + 2 instead of using this */
+#define  FIX_1_082392200    #277
+#define  FIX_1_847759065    #473
+#define  FIX_2_613125930    #669
+
+#define  RANGE_MASK   1023
+
+
+
+jpeg_idct_ifast:
+    PLD     [r2, #0]                                    // preload hint for coef_block
+    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr} // save ten registers
+    ldr     r4, [sp, #4*10]                             // r4 = fifth argument (output_col), just above the ten saved words
+    sub     sp, #local_SIZE                             // reserve the locals and the workspace
+
+    ldr     r10,[r1, #off_COMPINFO_quanttable]         // r10 = quanttable
+    str     r4, local_OUTPUT_COL
+    str     r3, local_OUTPUT_BUF
+    ldr     r5, [r0, #off_DECOMPRESS_range_limit_base]
+    add     r5, r5, #128                                // stored range table pointer = base + 128
+    str     r5, local_RANGE_TABLE
+    mov     fp, r2                                      // fp = coef_block
+    add     ip, sp, #off_WORKSPACE                      // ip = first workspace column
+
+VLoopTail:
+    ldrsh    r0, [fp, #VY(0)]        // r0..r7 = the eight coefficients of the current column (rows 0..7)
+    ldrsh    r1, [fp, #VY(1)]
+    ldrsh    r2, [fp, #VY(2)]
+    ldrsh    r3, [fp, #VY(3)]
+    ldrsh    r4, [fp, #VY(4)]
+    ldrsh    r5, [fp, #VY(5)]
+    ldrsh    r6, [fp, #VY(6)]
+    ldrsh    r7, [fp, #VY(7)]
+
+    cmp      r1, #0                  // Z stays set only while every row 1..7 tested so far is zero
+    orreqs   r8, r2, r3
+    orreqs   r8, r4, r5
+    orreqs   r8, r6, r7
+    beq      VLoopHeadZero           // rows 1..7 all zero: take the short path
+
+VLoopHead:
+    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0])  (r0)
+    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4])  (r4)
+    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2])  (r2)
+    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6])  (r6)
+    // tmp10 = tmp0 + tmp2   (r0)
+    // tmp11 = tmp0 - tmp2   (r4)
+
+    ldr      r9, [r10, #QY(4)]
+    ldr      r8, [r10, #QY(0)]
+#if __ARM_HAVE_HALFWORD_MULTIPLY
+    smulbb   r4, r9, r4
+    smlabb   r0, r8, r0, r4
+#else
+    mul      r4, r9, r4
+    mul      r0, r8, r0
+    add      r0, r4
+#endif
+    ldr      r9, [r10, #QY(6)]
+    ldr      r8, [r10, #QY(2)]
+    sub      r4, r0, r4, lsl #1      // r4 = tmp10 - 2*tmp2 = tmp11
+#if __ARM_HAVE_HALFWORD_MULTIPLY
+    smulbb   r6, r9, r6
+    smlabb   r2, r8, r2, r6
+#else
+    mul      r6, r9, r6
+    mul      r2, r8, r2
+    add      r2, r6
+#endif
+
+    // tmp13 = tmp1 + tmp3                                       (r2)
+    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
+    // FIX_1_4142... = 362 = 45*8 + 2
+    sub      r6, r2, r6, lsl #1
+    mov      r8, #360
+    add      r8, r8, #2
+    mul      r9, r6, r8
+
+    // tmp0 = tmp10 + tmp13;   (r0)
+    // tmp3 = tmp10 - tmp13;   (r8)
+    // tmp1 = tmp11 + tmp12;   (r4)
+    // tmp2 = tmp11 - tmp12;   (r6)
+    add     r0, r0, r2
+    rsb     r6, r2, r9, asr #8
+    sub     r8, r0, r2, lsl #1
+    add     r4, r4, r6
+    sub     r6, r4, r6, lsl #1
+
+    stmia   local_TMP0123, {r0, r4, r6, r8}
+
+    // NOTE: be sure not to use r0,r4,r6,r8 soon after the stm above
+
+    // odd part
+    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
+    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
+    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
+    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
+    // z13 = tmp6 + tmp5;  (r0)
+    // z10 = tmp6 - tmp5;  (r2)
+    // z11 = tmp4 + tmp7;  (r4)
+    // z12 = tmp4 - tmp7;  (r6)
+
+    ldr     r2, [r10, #QY(1)]
+    ldr     r9, [r10, #QY(5)]
+#if __ARM_HAVE_HALFWORD_MULTIPLY
+    smulbb  r1, r2, r1
+#else
+    mul     r1, r2, r1
+#endif
+    ldr     r2, [r10, #QY(3)]
+#if __ARM_HAVE_HALFWORD_MULTIPLY
+    smulbb  r5, r9, r5
+#else
+    mul     r5, r9, r5
+#endif
+    ldr     r9, [r10, #QY(7)]
+#if __ARM_HAVE_HALFWORD_MULTIPLY
+    smlabb  r0, r2, r3, r5
+    smlabb  r4, r9, r7, r1
+#else
+    mul     r0, r2, r3
+    add     r0, r5
+    mul     r4, r9, r7
+    add     r4, r1
+#endif
+    rsb  r2, r0, r5, lsl #1
+    rsb  r6, r4, r1, lsl #1
+
+    // tmp7 = z11 + z13;                             (r7)
+    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
+    // FIX_... = 360 + 2
+    add   r7, r4, r0
+    sub   r1, r4, r0
+    mov   r8, #360
+    add   r8, r8, #2
+    mul   r1, r8, r1
+
+    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
+    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
+    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
+    // FIX_1_8477... = 473 = 472 + 1
+    // FIX_1_082...  = 277 = 276 + 1
+    // FIX_2_...     = 669 = 668 + 1
+    add     r8, r2, r6
+    mov     r9, #472
+    mla     r8, r9, r8, r8
+    mov     r9, #276
+    mla     r0, r6, r9, r6
+    mov     r9, #668
+    mla     r2, r9, r2, r2
+    sub     r0, r0, r8
+    rsb     r2, r2, r8
+
+    // tmp6 = tmp12 - tmp7;  (r6)
+    // tmp5 = tmp11 - tmp6;  (r5)
+    // tmp4 = tmp10 + tmp5;  (r4)
+    rsb  r6, r7, r2, asr #8
+    rsb  r5, r6, r1, asr #8
+    add  r4, r5, r0, asr #8
+
+    ldmia local_TMP0123, {r0, r1, r2, r3}
+
+    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
+    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
+    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
+    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
+    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
+    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
+    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
+    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
+
+    add   r0, r0, r7
+    sub   r7, r0, r7, lsl #1
+    add   r1, r1, r6
+    sub   r6, r1, r6, lsl #1
+    add   r2, r2, r5
+    sub   r5, r2, r5, lsl #1
+    sub   r3, r3, r4
+    add   r4, r3, r4, lsl #1
+
+    str   r0, [ip, #QY(0)]
+    str   r1, [ip, #QY(1)]
+    str   r2, [ip, #QY(2)]
+    str   r3, [ip, #QY(3)]
+    str   r4, [ip, #QY(4)]
+    str   r5, [ip, #QY(5)]
+    str   r6, [ip, #QY(6)]
+    str   r7, [ip, #QY(7)]
+
+    // inptr++;                    /* advance pointers to next column */
+    // quantptr++;
+    // wsptr++;
+    add  fp, fp, #2
+    add  r10, r10, #4
+    add  ip, ip, #4
+    add  r0, sp, #(off_WORKSPACE + 4*8)   // r0 = end of the first workspace row, reached after eight columns
+    cmp  ip, r0
+    bne  VLoopTail
+
+
+
+HLoopStart:
+    // reset pointers
+    PLD     [sp, #off_WORKSPACE]
+    add     ip, sp, #off_WORKSPACE       // ip = start of the workspace, consumed one row at a time below
+    ldr     r10, local_RANGE_TABLE       // r10 is not read again by the row pass in this file
+
+HLoopTail:
+    // output = *output_buf++ + output_col
+    ldr      r0, local_OUTPUT_BUF
+    ldr      r1, local_OUTPUT_COL
+    ldr      r2, [r0], #4            // r2 = current row pointer, r0 steps to the next entry
+    str      r0, local_OUTPUT_BUF
+    add      fp, r2, r1              // fp = address of the eight output bytes for this row
+
+    PLD      [ip, #32]
+    ldmia    ip!, {r0-r7}            // r0..r7 = one workspace row, ip advances by 32 bytes
+
+    cmp      r1, #0                  // Z stays set only while every value 1..7 tested so far is zero
+    orreqs   r8, r2, r3
+    orreqs   r8, r4, r5
+    orreqs   r8, r6, r7
+    beq      HLoopTailZero           // values 1..7 all zero: take the short path
+
+HLoopHead:
+    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);    (r0)
+    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);    (r4)
+    add     r0, r0, r4
+    sub     r4, r0, r4, lsl #1
+
+    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);                                   (r2)
+    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13;  (r6)
+    // FIX_... = 360 + 2
+    add     r2, r2, r6
+    sub     r6, r2, r6, lsl #1
+    mov     r8, #360
+    add     r8, r8, #2
+    mul     r6, r8, r6
+
+    // tmp0 = tmp10 + tmp13;   (r0)
+    // tmp3 = tmp10 - tmp13;   (r8)
+    // tmp1 = tmp11 + tmp12;   (r4)
+    // tmp2 = tmp11 - tmp12;   (r6)
+    add     r0, r0, r2
+    rsb     r6, r2, r6, asr #8
+    sub     r8, r0, r2, lsl #1
+    add     r4, r4, r6
+    sub     r6, r4, r6, lsl #1
+
+    stmia   local_TMP0123, {r0, r4, r6, r8}
+
+    // Odd part
+
+    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];  (r0)
+    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];  (r2)
+    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];  (r4)
+    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];  (r6)
+    add  r0, r5, r3
+    sub  r2, r5, r3
+    add  r4, r1, r7
+    sub  r6, r1, r7
+
+    // tmp7 = z11 + z13;                             (r7)
+    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
+    // FIX_... = 360 + 2
+    add   r7, r4, r0
+    sub   r1, r4, r0
+    mov   r8, #360
+    add   r8, r8, #2
+    mul   r1, r8, r1
+
+    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
+    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
+    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
+    // FIX_1_8477... = 473 = 472 + 1
+    // FIX_1_082...  = 277 = 276 + 1
+    // FIX_2_...     = 669 = 668 + 1
+    add  r8, r2, r6
+    mov  r9, #472
+    mla  r8, r9, r8, r8
+    mov  r9, #276
+    mla  r0, r6, r9, r6
+    mov  r9, #668
+    mla  r2, r9, r2, r2
+    sub  r0, r0, r8
+    sub  r2, r8, r2
+
+    // tmp6 = tmp12 - tmp7;  (r6)
+    // tmp5 = tmp11 - tmp6;  (r5)
+    // tmp4 = tmp10 + tmp5;  (r4)
+    rsb  r6, r7, r2, asr #8
+    rsb  r5, r6, r1, asr #8
+    add  r4, r5, r0, asr #8
+
+    ldmia local_TMP0123, {r0, r1, r2, r3}
+
+    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
+    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
+
+    mov    r8, #128                  // bias added to every descaled value below
+    add    r0, r0, r7
+    sub    r7, r0, r7, lsl #1
+    add    r0, r8, r0, asr #5
+    add    r7, r8, r7, asr #5
+    add    r1, r1, r6
+    sub    r6, r1, r6, lsl #1
+    add    r1, r8, r1, asr #5
+    add    r6, r8, r6, asr #5
+    add    r2, r2, r5
+    sub    r5, r2, r5, lsl #1
+    add    r2, r8, r2, asr #5
+    add    r5, r8, r5, asr #5
+    sub    r3, r3, r4
+    add    r4, r3, r4, lsl #1
+    add    r3, r8, r3, asr #5
+    add    r4, r8, r4, asr #5
+
+#if __ARM_ARCH__ >= 6
+    usat   r0, #8, r0
+    usat   r1, #8, r1
+    usat   r2, #8, r2
+    usat   r3, #8, r3
+    usat   r4, #8, r4
+    usat   r5, #8, r5
+    usat   r6, #8, r6
+    usat   r7, #8, r7
+#else
+    cmp    r0, #255
+    mvnhi  r0, r0, asr #31
+    andhi  r0, #255
+    cmp    r7, #255
+    mvnhi  r7, r7, asr #31           // no masking needed: r7 is only used shifted left by 24, which drops bits above 7
+    cmp    r1, #255
+    mvnhi  r1, r1, asr #31
+    andhi  r1, #255
+    cmp    r6, #255
+    mvnhi  r6, r6, asr #31
+    andhi  r6, #255
+    cmp    r2, #255
+    mvnhi  r2, r2, asr #31
+    andhi  r2, #255
+    cmp    r5, #255
+    mvnhi  r5, r5, asr #31
+    andhi  r5, #255
+    cmp    r3, #255
+    mvnhi  r3, r3, asr #31           // no masking needed: r3 is only used shifted left by 24, which drops bits above 7
+    cmp    r4, #255
+    mvnhi  r4, r4, asr #31
+    andhi  r4, #255
+#endif
+
+    // r3 r2 r1 r0
+    orr    r0, r0, r1, lsl #8
+    orr    r0, r0, r2, lsl #16
+    orr    r0, r0, r3, lsl #24
+
+    // r7 r6 r5 r4
+    orr    r1, r4, r5, lsl #8
+    orr    r1, r1, r6, lsl #16
+    orr    r1, r1, r7, lsl #24
+    stmia  fp, {r0, r1}              // write the eight packed output bytes
+
+    add    r0, sp, #(off_WORKSPACE + 8*8*4)   // r0 = end of the workspace
+    cmp    ip, r0
+    bne    HLoopTail
+
+Exit:
+    add    sp, sp, #local_SIZE                           // release the locals and the workspace
+    ldmia  sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}   // restore the ten registers saved on entry
+    bx     lr
+
+
+VLoopHeadZero:
+// ok, all AC coefficients are 0
+    ldr      r1, [r10, #QY(0)]       // r1 = quant value for row 0 of this column
+    add      fp, fp, #2              // advance the coefficient and quant pointers to the next column
+    add      r10, r10, #4
+    mul      r0, r1, r0              // r0 = quant value times the row 0 coefficient
+    str      r0, [ip, #QY(0)]        // the same value fills all eight rows of this workspace column
+    str      r0, [ip, #QY(1)]
+    str      r0, [ip, #QY(2)]
+    str      r0, [ip, #QY(3)]
+    str      r0, [ip, #QY(4)]
+    str      r0, [ip, #QY(5)]
+    str      r0, [ip, #QY(6)]
+    str      r0, [ip, #QY(7)]
+    add      ip, ip, #4
+    add      r0, sp, #(off_WORKSPACE + 4*8)   // same end test as the full column path
+    cmp      ip, r0
+    beq      HLoopStart
+    b        VLoopTail
+
+HLoopTailZero:
+    mov      r0, r0, asr #5          // same descale and bias as the full row path, applied to value 0 only
+    add      r0, #128
+
+#if __ARM_ARCH__ >= 6
+    usat     r0, #8, r0
+#else
+    cmp      r0, #255
+    mvnhi    r0, r0, asr #31
+    andhi    r0, r0, #255
+#endif
+
+    orr      r0, r0, lsl #8          // replicate the byte into all four bytes of r0
+    orr      r0, r0, lsl #16
+    mov      r1, r0
+    stmia    fp, {r0, r1}            // eight identical output bytes
+
+    add      r0, sp, #(off_WORKSPACE + 64*4)   // r0 = end of the workspace
+    cmp      ip, r0
+    beq      Exit
+    b        HLoopTail
+
+    .endfunc
diff --git a/jmem-android.c b/jmem-android.c
new file mode 100644
index 0000000..eefb8ac
--- /dev/null
+++ b/jmem-android.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 2007-2008 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jmemsys.h"		/* import the system-dependent declarations */
+#include <unistd.h>         /* For unlink() and getpid() */
+
+#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
+extern void * malloc JPP((size_t size));
+extern void free JPP((void *ptr));
+#endif
+
+#ifndef SEEK_SET		/* pre-ANSI systems may not define this; */
+#define SEEK_SET  0		/* if not, assume 0 is correct */
+#endif
+
+
+/*
+ * Memory allocation and freeing are controlled by the regular library
+ * routines malloc() and free().
+ */
+
+GLOBAL(void *)
+jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
+{
+  return (void *) malloc(sizeofobject);	/* cinfo is unused; malloc's result is returned unchecked */
+}
+
+GLOBAL(void)
+jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
+{
+  free(object);	/* cinfo and sizeofobject are unused */
+}
+
+
+/*
+ * "Large" objects are treated the same as "small" ones.
+ * NB: although we include FAR keywords in the routine declarations,
+ * this file won't actually work in 80x86 small/medium model; at least,
+ * you probably won't be able to process useful-size images in only 64KB.
+ */
+
+GLOBAL(void FAR *)
+jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
+{
+  return (void FAR *) malloc(sizeofobject);	/* same allocator as jpeg_get_small; cinfo is unused */
+}
+
+GLOBAL(void)
+jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
+{
+  free(object);	/* cinfo and sizeofobject are unused */
+}
+
+
+/*
+ * This routine computes the total memory space available for allocation.
+ * It's impossible to do this in a portable way; our current solution is
+ * to make the user tell us (with a default value set at compile time).
+ * If you can actually get the available space, it's a good idea to subtract
+ * a slop factor of 5% or so.
+ */
+
+#ifndef DEFAULT_MAX_MEM		/* so can override from makefile */
+#define DEFAULT_MAX_MEM		10000000L /* default: ten megabyte */
+#endif
+
+GLOBAL(long)
+jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
+		    long max_bytes_needed, long already_allocated)
+{
+  return cinfo->mem->max_memory_to_use - already_allocated;	/* remaining budget; min/max_bytes_needed are ignored */
+}
+
+
+/*
+ * Backing store (temporary file) management.
+ * Backing store objects are only used when the value returned by
+ * jpeg_mem_available is less than the total space needed.  You can dispense
+ * with these routines if you have plenty of virtual memory; see jmemnobs.c.
+ */
+
+
+METHODDEF(void)
+read_backing_store (j_common_ptr cinfo, backing_store_ptr info,
+		    void FAR * buffer_address,
+		    long file_offset, long byte_count)
+{
+  if (fseek(info->temp_file, file_offset, SEEK_SET))	/* position the temp file at file_offset */
+    ERREXIT(cinfo, JERR_TFILE_SEEK);
+  if (JFREAD(info->temp_file, buffer_address, byte_count)
+      != (size_t) byte_count)				/* a short read is reported as an error */
+    ERREXIT(cinfo, JERR_TFILE_READ);
+}
+
+
+METHODDEF(void)
+write_backing_store (j_common_ptr cinfo, backing_store_ptr info,
+		     void FAR * buffer_address,
+		     long file_offset, long byte_count)
+{
+  if (fseek(info->temp_file, file_offset, SEEK_SET))	/* position the temp file at file_offset */
+    ERREXIT(cinfo, JERR_TFILE_SEEK);
+  if (JFWRITE(info->temp_file, buffer_address, byte_count)
+      != (size_t) byte_count)				/* a short write is reported as an error */
+    ERREXIT(cinfo, JERR_TFILE_WRITE);
+}
+
+
+METHODDEF(void)
+close_backing_store (j_common_ptr cinfo, backing_store_ptr info)
+{
+  fclose(info->temp_file);
+  /* The file was already unlinked right after it was opened (see
+   * getTempFileFromPath), so no explicit file deletion is needed.
+   */
+}
+
+static FILE* getTempFileFromPath(const char * path) {
+    FILE * fd = fopen(path, "w+");  /* create or truncate, open for reading and writing; NULL on failure */
+    if (fd != NULL) unlink(path);   /* remove the name only when the open succeeded; the stream stays open */
+    return fd;
+}
+
+static FILE* getTempFile(void) {
+    /* The name embeds only the process id.  NOTE(review): two decoders in one
+     * process would target the same path -- confirm that is acceptable. */
+    char path[1024];
+    int len = snprintf(path, sizeof(path), "/sdcard/.%d.tmp", getpid());
+
+    if (len < 0 || (size_t) len >= sizeof(path))
+        return NULL;    /* formatting failed or the name was truncated */
+    /* /sdcard is the only location tried; NULL means it was not usable. */
+    return getTempFileFromPath(path);
+}
+
+/*
+ * Initial opening of a backing-store object.
+ *
+ * This version opens a file through getTempFile(), which unlinks the file
+ * right after opening it.  info->temp_name[] is not used at all, and the
+ * creation error is raised with an empty name; total_bytes_needed is ignored.
+ */
+
+GLOBAL(void)
+jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
+			 long total_bytes_needed)
+{
+  if ((info->temp_file = getTempFile()) == NULL)
+    ERREXITS(cinfo, JERR_TFILE_CREATE, "");
+  info->read_backing_store = read_backing_store;
+  info->write_backing_store = write_backing_store;
+  info->close_backing_store = close_backing_store;
+}
+
+
+/*
+ * These routines take care of any system-dependent initialization and
+ * cleanup required.
+ */
+
+GLOBAL(long)
+jpeg_mem_init (j_common_ptr cinfo)
+{
+  return DEFAULT_MAX_MEM;	/* default for max_memory_to_use; DEFAULT_MAX_MEM can be overridden at build time */
+}
+
+GLOBAL(void)
+jpeg_mem_term (j_common_ptr cinfo)
+{
+  /* no work: jpeg_mem_init above sets nothing up that would need releasing */
+}