
Posted By

Harry Potter
on 2022-06-01
04:51:08
 cc65 compressor too slow: help needed optimizing

Hi! I'm working on a compression program for cc65. So far it works well, but it is very slow. :( I'm asking for help optimizing it. Below are two snippets that seem to be taking a lot of time:

--------------------
static struct {
    unsigned cmppos;
    unsigned len;
    //unsigned char offs[2];
    unsigned ratio;
} vorcuroffs, vorbestoffs;

unsigned char vo3_cur[32], vo3_best[32];

static int iscompvor3ratio ()
{
    int i, cmp=11;
    for (i=0; i<buflen; i++) {
        if (vo3_cur[i]<19) cmp+=EstimDist (vo3_cur[i], 19);
        else cmp+=GetHuffLen(&cin[i], 1);
    }
    return cmp;
}

extern unsigned char *inblockpos, *testblockpos;
#pragma zpsym ("inblockpos")
#pragma zpsym ("testblockpos")

unsigned char __fastcall__ iscompvor3a (void);

unsigned char iscompvor3 (void)
{
    int testpos=CPos-1, i, j, k;
    int first=CPos>=500?CPos-500:0;
    unsigned len, len2, vorlen;
    unsigned unlen=0;
    int b2pos;
    unsigned char c;
    if (CPos<2) return 0;
    brdrcol=2;
    memset (&vo3_cur, 0, sizeof(vo3_cur));
    memset (&vo3_best, 0, sizeof(vo3_best));
    vorlen=0;
    vorbestoffs.ratio=GetHuffLen (cin, buflen)+10;
    for (; testpos>=first; testpos--) {
        //inblockpos=&InBuffer[CPos];
        testblockpos = &InBuffer[testpos];
        //for (j=0; j<1; j++){
        if (iscompvor3a ()==0) continue;
        vorcuroffs.ratio = iscompvor3ratio();
        if (vorcuroffs.ratio < vorbestoffs.ratio) {
            //vorbestoffs.cmppos=CPos-testpos-1;
            vorbestoffs.cmppos=testpos;
            vorbestoffs.ratio = vorcuroffs.ratio;
            //memcpy (&vo3_best, &vo3_cur, sizeof(vo3_cur));
            __asm__ (
                "\tldx\t_buflen\n"
                "\tdex\n"
                "@a01:\n"
                "\tlda\t_vo3_cur,x\n"
                "\tsta\t_vo3_best,x\n"
                "\tdex\n"
                "\tbpl\t@a01\n"
            );
            vorlen=1;
        }
        //}
    }
    brdrcol=0;
    vorbestoffs.cmppos = CPos-vorbestoffs.cmppos-1;
    return vorlen;
}
-------------------
_iscompvor3a:
ldx #0
;sta tmp1
ldy _buflen
dey
@a01: lda (_cin),y
clc
adc #9
sec
sbc (_testblockpos),y
sta _vo3_cur,y
cmp #19
bcs *+3
inx
dey
bpl @a01
txa
ldx #0
rts
---------------------
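For reference, the assembly routine does roughly this in C terms (just a readable sketch of the same logic; the asm above is the real code, and it assumes buflen is between 1 and 128):

--------------------
/* sketch: compute the biased difference cin[y] + 9 - testblockpos[y]
   (8-bit wraparound intended), store it in vo3_cur[y], and return how
   many of those values are below 19 */
unsigned char iscompvor3a_sketch (void)
{
    unsigned char y, count = 0;
    for (y = 0; y < buflen; ++y) {
        vo3_cur[y] = (unsigned char)(cin[y] + 9 - testblockpos[y]);
        if (vo3_cur[y] < 19) ++count;
    }
    return count;
}
--------------------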

Any help would be appreciated.

Posted By

Mad
on 2022-05-31
15:29:24
 Re: cc65 compressor too slow: help needed optimizing

Maybe you have to go through your own process of optimizing stuff.. Some tips for that anyway:

- Here is a list with the cycle count of every asm instruction: http://www.oxyron.de/html/opcodes02.html
- You can use illegal opcodes.
- Especially lax is sometimes worth gold to me (it loads the accumulator and X at once with a value); you can also see in that list that some forms of lax seem to be unstable: http://www.oxyron.de/html/opcodes02.html
- Use self-modifying code wherever you can.
- Unroll loops; most of the time you can even add a lot of functionality while also getting a lot faster this way.
- Conditional branches like beq, bne and so on take 3 cycles if they jump and 2 cycles if they don't. That means it's sometimes worth taking the inverse branch (bne instead of beq, for instance).
- Don't use jsr in timing-critical code.. jsr takes 6 cycles and rts takes 6 cycles too, which is a lot to bear (12 cycles :/).
- Use tables wherever possible (if you have the memory, precalc stuff). You can even use a table instead of an add if it costs fewer cycles (or if you want to keep the carry flag intact); see the sketch after this list.
- There is a lot more, but mainly you have to go through your own process.. Optimizing is where the fun starts, and most demoscene people rely heavily on the speed of their routines.. So become a good optimizer yourself; it really will help you a lot of times on this platform.
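For instance, in your iscompvor3ratio() above: if EstimDist(d, 19) has no side effects and its result depends only on its arguments (I don't know your code, so this is just a guess), you could precalc its values once and turn the call in the inner loop into a table lookup:

--------------------
/* guess/sketch only: precalc EstimDist(d, 19) for d = 0..18, i.e. the
   "< 19" case in iscompvor3ratio(). The prototype below is assumed. */
extern int EstimDist (unsigned char d, unsigned char max);   /* assumed prototype */

static int estimtab[19];

static void InitEstimTab (void)
{
    unsigned char d;
    for (d = 0; d < 19; ++d) {
        estimtab[d] = EstimDist (d, 19);
    }
}

/* then in the loop:
       if (vo3_cur[i] < 19) cmp += estimtab[vo3_cur[i]];
       else cmp += GetHuffLen (&cin[i], 1);
*/
--------------------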

good luck with your project..


Ok, some simple optimizations anyway:
This one:
"@a01:\n"
"\tlda\t_vo3_cur,x\n"
"\tsta\t_vo3_best,x\n"
"\tdex\n"
"\tbpl\t@a01\n"

Maybe it's worth doing "fragment copies" here if it's a long region to copy:

inx ; no bpl used this way (and you may even get to copy more than 128 bytes by this)
beq .error ; copying exactly 256 bytes is not possible with this technique, I think
cpx #$10
bcc .no16loop ; maybe bmi if you have signed values in x
; carry is already set for the sbc, no need to set it (with bmi you would need a sec here)
.a
"\tlda\t_vo3_cur-0-1,x\n" ; -1 because of inx
"\tsta\t_vo3_best-0-1,x\n"
"\tlda\t_vo3_cur-1-1,x\n"
"\tsta\t_vo3_best-1-1,x\n"
"\tlda\t_vo3_cur-2-1,x\n"
"\tsta\t_vo3_best-2-1,x\n"
"\tlda\t_vo3_cur-3-1,x\n"
"\tsta\t_vo3_best-3-1,x\n"
"\tlda\t_vo3_cur-4-1,x\n"
"\tsta\t_vo3_best-4-1,x\n"
"\tlda\t_vo3_cur-5-1,x\n"
"\tsta\t_vo3_best-5-1,x\n"
....
"\tlda\t_vo3_cur-15-1,x\n"
"\tsta\t_vo3_best-15-1,x\n"
txa
sbc #$10
tax
cpx #$10
bcs .a ; implicit sec for sbc (if you use a bmi you need some sec somewhere before the sbc)
.no16loop

cpx #$00 ; everything already copied by the 16-byte loop?
beq .no
.b
"\tlda\t_vo3_cur-0-1,x\n"
"\tsta\t_vo3_best-0-1,x\n"
dex
bne .b
.no

etc..

You save almost 16 of these:
dex
bpl .b
i.e. 16 * (2+3) = 80 cycles of loop overhead; minus roughly 11 cycles of txa/sbc/tax/cpx/bcs bookkeeping, that's around 69 cycles saved per 16-byte chunk.

I didn't test this code, it may well be broken; it's just so you get the idea of unrolling at least parts of this loop..

Another technique would be:

loop:
"\ttlda\t_vo3_cur+0\n"
"\tsta\t_vo3_best+0\n"
"\ttlda\t_vo3_cur+1\n"
"\tsta\t_vo3_best+1\n"
"\ttlda\t_vo3_cur+2\n"
"\tsta\t_vo3_best+2\n"
"\ttlda\t_vo3_cur+3\n"
"\tsta\t_vo3_best+3\n"
"\ttlda\t_vo3_cur+4\n"
"\tsta\t_vo3_best+4\n"
..
"\ttlda\t_vo3_cur+255\n"
"\tsta\t_vo3_best+255\n"
rts ; if you want to copy all 256 values this byte needs to be here

Put the rts into the loop at position loop + (x+1) * (3*2) (3 bytes for the lda and 3 bytes for the sta); you can use a table (actually two tables, lo and hi) for the multiplication
jsr loop
remove rts from loop

(You would just have to insert an rts by self-modifying code at the point where you want the (completely unrolled) loop to stop.
But be aware that you have to remove the rts again afterwards.)

Maybe this doesn't help, but you asked for suggestions.. I don't know what your code is supposed to do, and I don't even know if this loop is a problem at all. The loop needs to copy more than, let's say, 32 bytes for these optimizations to pay off.. If it copies around 16 bytes every time, they are pointless..

For small loops you could use

lda postable,x ; this is (20-(x+1))*(3*2)
sta .modify + 1
lda #$00
.modify beq loop2 ; this jumps into the unrolled code section at the right position; no hi and lo table needed this way. The unrolled code must be in reverse order for this and end with an rts, like this:

loop2
"\ttlda\t_vo3_cur+19\n"
"\tsta\t_vo3_best+19\n"
...
"\ttlda\t_vo3_cur+0\n"
"\tsta\t_vo3_best+0\n"
rts

but this would only copy 20 bytes at most


Posted By

Harry Potter
on 2022-05-31
14:47:33
 Re: cc65 compressor too slow: help needed optimizing

I thank you for your advice. I can inline some code for LZ77, but it wouldn't help much here.

Posted By

Mad
on 2022-05-31
15:16:29
 Re: cc65 compressor too slow: help needed optimizing

I just updated the comment above with some lengthy unroll stuff... Good luck with optimizing..

Posted By

Harry Potter
on 2022-05-31
15:24:48
 Re: cc65 compressor too slow: help needed optimizing

Again, I thank you for your help. I can't do the loop unrolling, as I need to copy a certain number of bytes, and that number changes.

Posted By

Mad
on 2022-06-01
08:28:39
 Re: cc65 compressor too slow: help needed optimizing

Then I cannot help you.. All three suggestions above can copy a changing number of bytes (the amount is in the X register at the start).. But anyhow, have fun..
lda #$00 ; somewhere at program init
sta $f0 ; somewhere at program init

copyBytes:

lda rtstablehi,x
sta $f1
ldy rtstablelo,x
lda #OPCODE_RTS
sta ($f0),y ; places an rts in the unrolled part
jsr unrolledPart
lda #OPCODE_LDA_ABS
sta ($f0),y ; removes the rts in the unrolled part again
rts

unrolledPart:
"\ttlda\t_vo3_cur+0\n"
"\tsta\t_vo3_best+0\n"
"\ttlda\t_vo3_cur+1\n"
"\tsta\t_vo3_best+1\n"
"\ttlda\t_vo3_cur+2\n"
"\tsta\t_vo3_best+2\n"
"\ttlda\t_vo3_cur+3\n"
"\tsta\t_vo3_best+3\n"
"\ttlda\t_vo3_cur+4\n"
"\tsta\t_vo3_best+4\n"
..
"\ttlda\t_vo3_cur+255\n"
"\tsta\t_vo3_best+255\n"
rts ; if you want to copy all 256 values this byte needs to be here

Just to clarify that..

edit: BTW sorry, you are doing a compression program, so unrolling maybe isn't an option then (because of program size :) )..

edit2 :) :

lda jmptablo,x
sta j + 1
lda jmptabhi,x
sta j + 2
j: jmp unrolledPart
unrolledPart:
"\ttlda\t_vo3_cur+255\n"
"\tsta\t_vo3_best+255\n"
"\ttlda\t_vo3_cur+254\n"
"\tsta\t_vo3_best+254\n"
..
"\ttlda\t_vo3_cur+0\n"
"\tsta\t_vo3_best+0\n"


Posted By

Harry Potter
on 2022-05-31
15:40:18
 Re: cc65 compressor too slow: help needed optimizing

Thank you.

Posted By

Harry Potter
on 2022-05-31
16:52:57
 Re: cc65 compressor too slow: help needed optimizing

I managed to make it slightly faster by moving the testblockpos assignment outside the loop and decrementing it each time through. :)
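
In other words (just a sketch of the change, not the exact code):

--------------------
/* set testblockpos once before the loop, then step it along with testpos */
testblockpos = &InBuffer[testpos];
for (; testpos >= first; testpos--, --testblockpos) {
    if (iscompvor3a () == 0) continue;
    /* ...rest of the loop body unchanged... */
}
--------------------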

Posted By

Harry Potter
on 2022-06-01
14:54:44
 Re: cc65 compressor too slow: help needed optimizing

I'm sorry that I wasted your time. :( I dropped that technique, because I found one that's both better and faster. :) It still pauses for split seconds at a time, though, and I suspect the Adaptive Huffman codes are the culprit. Following are some of the functions:

--------------------
void __fastcall__ AddHuffNYT (void)
{
//  struct HuffEnv_Entry//* top = GetHuffEntry (HuffEnv.top), //&HuffEnv.Entry[HuffEnv.top],
//  //*bottomleft = GetHuffEntry (HuffEnv.top-2), //&HuffEnv.Entry[HuffEnv.top-2],
//  *bottomright = GetHuffEntry (HuffEnv.top-1); //&HuffEnv.Entry[HuffEnv.top-1];
    e1 = GetHuffEntry (HuffEnv.top-1);
    //bottomleft = GetHuffEntry (HuffEnv.top-2);
    bottomleft = e1; --bottomleft;
    //backcol++;
    //e2 = GetHuffEntry (HuffEnv.top-1);
    //HuffEnv.Entry[HuffEnv.top].child[1] = HuffEnv.top-1;
    //HuffEnv.Entry[HuffEnv.top].child[0] = HuffEnv.top-2;
    //top->child[1] = HuffEnv.top-1;
    //top->child[0] = HuffEnv.top-2;
    //top->occur++; bottomright->occur = 1;
    //++top->occur;bottomright->occur = 1;
    //top->occur = bottomright->occur =
    e1->bit = 1;
    //bottomleft->bit = 0;
    bottomleft->parent = e1->parent = HuffEnv.top;
    //bottomright->bit = 1;
    HuffEnv.entrynum[c] = HuffEnv.top-1;
    e1->c = //HuffEnv.top-1;
        c;
    HuffEnv.top -= 2;
    //relits |= 1<<(c>>4);
    --numhuffs;
    //relits |= 1<<(c>>4);
}

static unsigned __fastcall__ GetHuffGreatestInBlock (unsigned c)
{
    //register unsigned k;
    //struct HuffEnv_Entry* h = GetHuffEntry (512);
    unsigned j = (GetHuffEntry (c))->occur;
    //register int c2 = GetHuffEntry (c)->c;
    //register int c2 = e3->c;
    for (tmpcptr = 512; 1; --tmpcptr) {

        //if (HuffEnv.Entry[tmpcptr].occur == j) k = tmpcptr;
        //if (GetHuffEntry (tmpcptr)->occur == j) k = tmpcptr;
        if (GetHuffEntry (tmpcptr)->occur == j /*&& c2^h->c >= 0*/) return tmpcptr;
        //--h;
    } //return c;

}

//void UpdateHuffCode (unsigned char c)
void __fastcall__ UpdateHuffCode (void)
{
    //unsigned z = 0;
    unsigned n, ch = HuffEnv.entrynum[c], x, y;
    //static struct HuffEnv_Entry* e1;
    HuffEnv.Entry[512].parent = -1;
    //do {
    //backcol = 8;
    while (ch<512) {
        //printc ('.'); if (z>10) {
        //  /*cgetc();*/ z = 0;} ++z;
        //n = GetHuffGreatestInBlock (HuffEnv.Entry[ch].occur);
        //HuffEnv.Entry[512].parent = -1;
        e1 = GetHuffEntry (n = GetHuffGreatestInBlock (ch));
        e2 = GetHuffEntry (ch);
        //e1 = GetHuffEntry (n);
        //if (n >= 512) break;
        if (n != ch &&
            e2->parent != n){
            if (/*HuffEnv.Entry[n].c*/((int)e2->c) >= 0) {
                HuffEnv.entrynum[e2->c] = n;
            }
            if (((int)e1->c) >= 0) {
                HuffEnv.entrynum[e1->c] = ch;
            }
            __asm__ (
                "\tldy\t#3\n"
                "\t@a01:\n"
                "\tlda\t(_e1),y\n"
                "\tpha\n"
                "\tlda\t(_e2),y\n"
                "\tsta\t(_e1),y\n"
                "\tpla\n"
                "\tsta\t(_e2),y\n"
                "\tiny\n"
                "\tcpy\t#7\n"
                "\tbcc\t@a01\n"
            );
        }
        ++e2->occur;
        ch = e2->parent;//HuffEnv.Entry[ch].parent;
    } ++HuffEnv.Entry[512].occur;
}
--------------------


Posted By

Harry Potter
on 2022-06-02
09:43:41
 Re: cc65 compressor too slow: help needed optimizing

The function GetHuffEntry() returns the address of the given entry in the Huffman code table. I increased the speed in GetHuffGreatestInBlock() by calling GetHuffEntry() once at the beginning and decrementing the resulting pointer each time through the loop. It helped. Now it doesn't pause anymore. :)
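
Roughly, the changed function now looks like this (sketch only, minus the commented-out experiments):

--------------------
static unsigned __fastcall__ GetHuffGreatestInBlock (unsigned c)
{
    unsigned j = (GetHuffEntry (c))->occur;
    /* call GetHuffEntry() once for the top entry, then just walk the pointer down */
    struct HuffEnv_Entry* h = GetHuffEntry (512);
    for (tmpcptr = 512; 1; --tmpcptr) {
        if (h->occur == j) return tmpcptr;
        --h;
    }
}
--------------------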

BTW, I was asking for help optimizing, but I have some optimization techniques of my own. They are at https://sourceforge.net/projects/cc65extra/files/. Try them out, and thank you for your insight. :)


