When copying images with integer-factor scaling, using divisions, floating point, or excessive branches is bloatware. By replacing the division by a constant scale factor with a multiplication by a precomputed integer factor (a fixed-point reciprocal) followed by a shift, the scaled copy can be made native.
//bloatware: copies the image `input` (stride iF, width iW, height iH) into `data`
//(stride F, width W, height H) with its top-left corner at (x, y), enlarged by the
//integer factor s. Strides are counted in uint16_t elements. Source pixels equal to
//0xFFFF are skipped, so the destination shows through there. Every destination pixel
//is bounds-checked individually and mapped back to its source pixel with two divisions.
void bloatwarescaledcopy1(const uint16_t* input, int iF, int iW, int iH, uint16_t* data, int F, int W, int H, int x, int y, int s){
    const int scaledH = iH*s;
    const int scaledW = iW*s;
    for(int row=0; row<scaledH; row++){
        const int dstY = y+row;
        if(dstY<0 || dstY>=H) continue; //row falls outside the destination
        for(int col=0; col<scaledW; col++){
            const int dstX = x+col;
            if(dstX<0 || dstX>=W) continue; //column falls outside the destination
            const uint16_t px = input[row/s*iF + col/s];
            if(px == 0xFFFF) continue; //transparent source pixel
            data[dstY*F + dstX] = px;
        }
    }
}
//another bloatware: avoids division by walking the source pixels and replicating each
//one into an s-by-s cell of the destination, at the price of four nested loops and a
//bounds check per destination pixel.
//input/iF/iW/iH: source image, its stride (in elements), width and height.
//data/F/W/H: destination image, its stride (in elements), width and height.
//x, y: destination position of the scaled image's top-left corner; s: integer scale.
//Source pixels equal to 0xFFFF are not copied.
void bloatwarescaledcopy2(const uint16_t* input, int iF, int iW, int iH, uint16_t* data, int F, int W, int H, int x, int y, int s){
    for(int srcY=0; srcY<iH; srcY++){
        for(int subY=0; subY<s; subY++){
            const int dstY = y + srcY*s + subY;
            if(dstY<0 || dstY>=H) continue; //row falls outside the destination
            for(int srcX=0; srcX<iW; srcX++){
                for(int subX=0; subX<s; subX++){
                    const int dstX = x + srcX*s + subX;
                    if(dstX<0 || dstX>=W) continue; //column falls outside the destination
                    const uint16_t px = input[srcY*iF + srcX];
                    if(px != 0xFFFF){
                        data[dstY*F + dstX] = px;
                    }
                }
            }
        }
    }
}
//floating point is even more bloated and is affected by rounding errors:
//1.0/s is not exact, so k*(1.0/s) can land just below an integer (49*(1.0/49) is
//0.9999999999999999) and truncate to the wrong source pixel. Sampling at the centre
//of each destination pixel, (k+0.5)/s, keeps the product at least 0.5/s away from
//every integer, so the rounding error can no longer cross a pixel boundary.
//input/iF/iW/iH: source image, its stride (in elements), width and height.
//data/F/W/H: destination image, its stride (in elements), width and height.
//x, y: destination position of the scaled image's top-left corner; s: integer scale.
//Source pixels equal to 0xFFFF are not copied.
void bloatwarescaledcopy3(const uint16_t* input, int iF, int iW, int iH, uint16_t* data, int F, int W, int H, int x, int y, int s){
    const double r = 1.0/s; //only used when the loops below run, i.e. when s > 0 for a non-empty image
    for(int k=0; k<iH*s; k++){
        if(y+k<0 || y+k>=H) continue; //row falls outside the destination
        const int srcY = (int)((k+0.5)*r);
        for(int l=0; l<iW*s; l++){
            if(x+l<0 || x+l>=W) continue; //column falls outside the destination
            const int srcX = (int)((l+0.5)*r);
            const uint16_t px = input[srcY*iF + srcX];
            if(px != 0xFFFF){
                data[(y+k)*F + (x+l)] = px;
            }
        }
    }
}
//still bloatware, since division is involved: the visible rectangle is clipped once
//up front, so the loops need no per-pixel bounds checks, but every pixel still costs
//a division by s.
//input/iF/iW/iH: source image, its stride (in elements), width and height.
//data/F/W/H: destination image, its stride (in elements), width and height.
//x, y: destination position of the scaled image's top-left corner; s: integer scale.
//Source pixels equal to 0xFFFF leave the destination pixel unchanged.
void bloatwarescaledcopy4(const uint16_t* input, int iF, int iW, int iH, uint16_t* data, int F, int W, int H, int x, int y, int s){
    //[k1,k2) and [l1,l2) are the rows/columns of the scaled image that land inside the destination
    int k1 = 0; if(k1 < -y) k1 = -y;
    int k2 = iH*s; if(k2 > H-y) k2 = H-y;
    int l1 = 0; if(l1 < -x) l1 = -x;
    int l2 = iW*s; if(l2 > W-x) l2 = W-x;
    if(l2 <= l1) return; //nothing visible horizontally
    for(int k=k1; k<k2; k++){
        //start of the destination row; the column offset x is applied when indexing so
        //that no pointer before the start of `data` is ever formed when x is negative
        uint16_t* o = data + (y+k)*F;
        const uint16_t* b = input + k/s*iF;
        for(int l=l1; l<l2; l++){
            const uint16_t px = b[l/s];
            o[x+l] = px==0xFFFF ? o[x+l] : px; //expressed as a select rather than an if
        }
    }
}
//transforming to multiplicative integer factor makes it native: the division by s is
//replaced by a multiplication with r = ceil(2^32/s) followed by a shift right by 32.
//(k*r)>>32 equals k/s for every non-negative k with k*s <= 2^32.
//input/iF/iW/iH: source image, its stride (in elements), width and height.
//data/F/W/H: destination image, its stride (in elements), width and height.
//x, y: destination position of the scaled image's top-left corner; s: integer scale.
//Source pixels equal to 0xFFFF leave the destination pixel unchanged.
void nativescaledcopy(const uint16_t* input, int iF, int iW, int iH, uint16_t* data, int F, int W, int H, int x, int y, int s){
    //[k1,k2) and [l1,l2) are the rows/columns of the scaled image that land inside the destination
    int k1 = 0; if(k1 < -y) k1 = -y;
    int k2 = iH*s; if(k2 > H-y) k2 = H-y;
    int l1 = 0; if(l1 < -x) l1 = -x;
    int l2 = iW*s; if(l2 > W-x) l2 = W-x;
    //nothing visible horizontally; with s == 0, l2 is at most 0, so this return also
    //keeps the division below from dividing by zero
    if(l2 <= l1) return;
    const int64_t r = 0xFFFFFFFFLL/s + 1; //32.32 fixed-point reciprocal of s, rounded up
    for(int k=k1; k<k2; k++){
        //start of the destination row; the column offset x is applied when indexing so
        //that no pointer before the start of `data` is ever formed when x is negative
        uint16_t* o = data + (y+k)*F;
        const uint16_t* b = input + (k*r>>32)*iF;
        for(int l=l1; l<l2; l++){
            const uint16_t px = b[l*r>>32];
            o[x+l] = px==0xFFFF ? o[x+l] : px; //expressed as a select rather than an if
        }
    }
}
//depending on the constraints of width and height and scaling factor, if width and
//height are less than 32768 even after scaling, a 32-bit multiplication can be used
//instead of a 64-bit multiplication, which debloats it further.
//The reciprocal 0x7FFFFFFF/s is shifted down until it fits in 16 bits (f tracks the
//remaining shift) and then rounded up, so that l*r stays within a 32-bit int for
//every l < 32768 and (l*r)>>f stands in for l/s.
//input/iF/iW/iH: source image, its stride (in elements), width and height.
//data/F/W/H: destination image, its stride (in elements), width and height.
//x, y: destination position of the scaled image's top-left corner; s: integer scale.
//Source pixels equal to 0xFFFF leave the destination pixel unchanged.
void nativescaledcopy32(const uint16_t* input, int iF, int iW, int iH, uint16_t* data, int F, int W, int H, int x, int y, int s){
    //[k1,k2) and [l1,l2) are the rows/columns of the scaled image that land inside the destination
    int k1 = 0; if(k1 < -y) k1 = -y;
    int k2 = iH*s; if(k2 > H-y) k2 = H-y;
    int l1 = 0; if(l1 < -x) l1 = -x;
    int l2 = iW*s; if(l2 > W-x) l2 = W-x;
    //nothing visible horizontally; with s == 0, l2 is at most 0, so this return also
    //keeps the division below from dividing by zero
    if(l2 <= l1) return;
    int r = 0x7FFFFFFF/s;
    int f = 31;
    //binary-search style normalisation: drop 8, 4, 2, then 1 low bits as needed
    if(r > 0x7FFFFF){ r >>= 8; f -= 8; }
    if(r > 0x7FFFF){ r >>= 4; f -= 4; }
    if(r > 0x1FFFF){ r >>= 2; f -= 2; }
    if(r > 0xFFFF){ r >>= 1; f -= 1; }
    r++; //round the truncated reciprocal up
    for(int k=k1; k<k2; k++){
        //start of the destination row; the column offset x is applied when indexing so
        //that no pointer before the start of `data` is ever formed when x is negative
        uint16_t* o = data + (y+k)*F;
        const uint16_t* b = input + (k*r>>f)*iF;
        for(int l=l1; l<l2; l++){
            const uint16_t px = b[l*r>>f];
            o[x+l] = px==0xFFFF ? o[x+l] : px; //expressed as a select rather than an if
        }
    }
}