| Posted By
Harry Potter on 2022-06-01 04:51:08
| cc65 compressor too slow: help needed optimizing
Hi! I am working on a compression program for cc65. So far, it's doing well but very slowly. I am asking for help to optimize it. Following are two snippets that seem to be taking a lot of time:
-------------------- static struct { unsigned cmppos; unsigned len; //unsigned char offs[2]; unsigned ratio; } vorcuroffs, vorbestoffs;
unsigned char vo3_cur[32], vo3_best[32];
static int iscompvor3ratio () { int i, cmp=11; for (i=0; i<buflen; i++) { if (vo3_cur<i><19) cmp+=EstimDist (vo3_cur<i>, 19); else cmp+=GetHuffLen(&cin<i>, 1); } return cmp; }
extern unsigned char *inblockpos, *testblockpos; #pragma zpsym ("inblockpos") #pragma zpsym ("testblockpos")
unsigned char __fastcall__ iscompvor3a (void);
unsigned char iscompvor3 (void) { int testpos=CPos-1, i, j, k; int first=CPos>=500?CPos-500:0; unsigned len, len2, vorlen; unsigned unlen=0; int b2pos; unsigned char c; if (CPos<2) return 0; brdrcol=2; memset (&vo3_cur, 0, sizeof(vo3_cur)); memset (&vo3_best, 0, sizeof(vo3_best)); vorlen=0; vorbestoffs.ratio=GetHuffLen (cin, buflen)+10; for (; testpos>=first; testpos--) { //inblockpos=&InBuffer[CPos]; testblockpos = &InBuffer[testpos]; //for (j=0; j<1; j++){ if (iscompvor3a ()==0) continue; vorcuroffs.ratio = iscompvor3ratio(); if (vorcuroffs.ratio < vorbestoffs.ratio) { //vorbestoffs.cmppos=CPos-testpos-1; vorbestoffs.cmppos=testpos; vorbestoffs.ratio = vorcuroffs.ratio; //memcpy (&vo3_best, &vo3_cur, sizeof(vo3_cur)); __asm__ ( "\tldx\t_buflen\n" "\tdex\n" "@a01:\n" "\tlda\t_vo3_cur,x\n" "\tsta\t_vo3_best,x\n" "\tdex\n" "\tbpl\t@a01\n" ); vorlen=1; } //} } brdrcol=0; vorbestoffs.cmppos = CPos-vorbestoffs.cmppos-1; return vorlen; } ------------------- _iscompvor3a: ldx #0 ;sta tmp1 ldy _buflen dey @a01: lda (_cin),y clc adc #9 sec sbc (_testblockpos),y sta _vo3_cur,y cmp #19 bcs *+3 inx dey bpl @a01 txa ldx #0 rts ---------------------
Any help would be appreciated.
|
|
Posted By
Mad on 2022-05-31 15:29:24
| Re: cc65 compressor too slow: help needed optimizing
Maybe you have to go through your own process in optimizing stuff.. Some tips for that anyways:
- Here is a list of the cycle count of every asm instruction: http://www.oxyron.de/html/opcodes02.html - You can use illegal opcodes - especially lax is for me sometimes worth some gold (it loads accu and x at once with a value), you can see in this list that some forms of lax seem to be not stable: http://www.oxyron.de/html/opcodes02.html - Use self modifying code where ever you can - Unroll loops, most of the time you can even add a lot of functionality whilst also getting a lot faster with this - conditional branches like beq,bne and so on use 3 cycles if they jump, if they don't jump they use 2 cycles. That means sometimes it's worth to take the inverse jump (bne instead of beq for instance) - don't use jsr in timing critical code.. Jsr takes 6 cycles and rts takes 6 cycles,too which is a lot to bear (12 cycles :/). - use tables wherever possible (if you have the memory precalc stuff) you even can use a table instead of an add if it uses lower cycles (or if you want to keep the carry bit intact). - there is a lot more, but mainly you have to got through your own process.. Optimizing stuff is where the fun starts and most of the demoscene people rely solely on the speed of there routines.. So to say be a good optimizer for your self, it really will help you a lot of times on this platform.
good luck with your project..
Ok, some simple optimization anyways: This one: "@a01:\n" "\tlda\t_vo3_cur,x\n" "\tsta\t_vo3_best,x\n" "\tdex\n" "\tbpl\t@a01\n"
Maybe it's worth to do "fragment copies" here if it's a long region to copy:
inx ; no bpl used this way (maybe also got to copy more than 128 bytes by this) beq .error ; copy extactly 256 bytes not possible with this technique I think cpx #$10 bcc .no16loop ; maybe bmi if you have signed values in x ; implicit sec for sbc no need to set it (if bmi it would need a sec here) .a "\tlda\t_vo3_cur-0-1,x\n" ; -1 because of inx "\tsta\t_vo3_best-0-1,x\n" "\tlda\t_vo3_cur-1-1,x\n" "\tsta\t_vo3_best-1-1,x\n" "\tlda\t_vo3_cur-2-1,x\n" "\tsta\t_vo3_best-2-1,x\n" "\tlda\t_vo3_cur-3-1,x\n" "\tsta\t_vo3_best-3-1,x\n" "\tlda\t_vo3_cur-4-1,x\n" "\tsta\t_vo3_best-4-1,x\n" "\tlda\t_vo3_cur-5-1,x\n" "\tsta\t_vo3_best-5-1,x\n" .... "\tlda\t_vo3_cur-15-1,x\n" "\tsta\t_vo3_best-15-1,x\n" txa sbc #$10 tax cpx #$10 bcs .a ; implicit sec for sbc (if you use a bmi you need some sec somewhere before the sbc) .no16loop
cpx #$00 ; got all already copied by the 16er loop? beq .no .b "\tlda\t_vo3_cur-0-1,x\n" "\tsta\t_vo3_best-0-1,x\n" dex bne .b .no
etc..
You spare almost 16 times the: dex bpl .b which is 16 * (2+3) = 80 - 11 = around 69 cycles for every 16er loop spared
I didn't test this code, maybe it's malicious just that you can get the idea, of unrolling atleast parts of this loop..
Another technique would be:
loop: "\ttlda\t_vo3_cur+0\n" "\tsta\t_vo3_best+0\n" "\ttlda\t_vo3_cur+1\n" "\tsta\t_vo3_best+1\n" "\ttlda\t_vo3_cur+2\n" "\tsta\t_vo3_best+2\n" "\ttlda\t_vo3_cur+3\n" "\tsta\t_vo3_best+3\n" "\ttlda\t_vo3_cur+4\n" "\tsta\t_vo3_best+4\n" .. "\ttlda\t_vo3_cur+255\n" "\tsta\t_vo3_best+255\n" rts ; if you want to copy all 256 values this byte needs to be here
put rts into loop at position "loop + (x+1) * (3*2)" # (3 bytes for lda and 3 bytes for sta), you can use a table (actuall two tables, lo and hi) for the multiplication jsr loop remove rts from loop
(you just would have to insert an rts by selfmodifying code there, where you want the (completely unrolled) loop to stop. But be aware that you have to remove the rts again afterwards.)
Maybe this doesn't help, but you asked for some suggestions.. I don't know what you code should do, I even don't know if this loop is a problem at all. This loop needs to copy more than let's say 32 bytes to get the optimizations working.. If it copies around 16 bytes everytime these optimizations are senseless..
For small loops you could use
lda postable,x ; this is (20-(x + 1))*(3*2) sta .modify + 1 lda #$00 .modify beq loop2 ; this jumps into the unrolled code section at the right position no need for a hi and lo table this way, the unrolled code must be inverse this way and end with an rts like this:
loop2 "\ttlda\t_vo3_cur+19\n" "\tsta\t_vo3_best+19\n" ... "\ttlda\t_vo3_cur+0\n" "\tsta\t_vo3_best+0\n" rts
but this would only copy max 20 bytes
|
|
Posted By
Harry Potter on 2022-05-31 14:47:33
| Re: cc65 compressor too slow: help needed optimizing
I thank you for your advice. I can inline some code for LZ77, but it wouldn't help much here.
|
|
Posted By
Mad on 2022-05-31 15:16:29
| Re: cc65 compressor too slow: help needed optimizing
I just updated the comment above with some lengthy unroll stuff... Good luck with optimizing..
|
|
Posted By
Harry Potter on 2022-05-31 15:24:48
| Re: cc65 compressor too slow: help needed optimizing
Again, I thank you for your help. I can't do the loop unrolling, as I need to copy a certain number of bytes, and that number changes.
|
|
Posted By
Mad on 2022-06-01 08:28:39
| Re: cc65 compressor too slow: help needed optimizing
I cannot help you.. All three suggestions above can copy changing numbers of bytes (the amount is in the x register at start).. But anyhow have fun.. lda #$00 ; somewhere at program init sta $f0 ; somewhere at program init
copyBytes:
lda rtstablehi,x sta $f1 ldy rtstablelo,x lda #OPCODE_RTS sta ($f0),y ; places an rts in the unrolled part jsr loop lda #OPCODE_LDA_ABS sta ($f0),y ; removes the rts in the unrolled part again rts
unrolledPart "\ttlda\t_vo3_cur+0\n" "\tsta\t_vo3_best+0\n" "\ttlda\t_vo3_cur+1\n" "\tsta\t_vo3_best+1\n" "\ttlda\t_vo3_cur+2\n" "\tsta\t_vo3_best+2\n" "\ttlda\t_vo3_cur+3\n" "\tsta\t_vo3_best+3\n" "\ttlda\t_vo3_cur+4\n" "\tsta\t_vo3_best+4\n" .. "\ttlda\t_vo3_cur+255\n" "\tsta\t_vo3_best+255\n" rts ; if you want to copy all 256 values this byte needs to be here
Just to clarify that..
edit: BTW sorry, you do a compression programm, unrolling maybe is not an option then (because of program size :) )..
edit2 :) :
lda jmptablo,x sta j + 1 lda jmptabhi,x sta j + 2 j: jmp unrolledPart unrolledPart: "\ttlda\t_vo3_cur+255\n" "\tsta\t_vo3_best+255\n" "\ttlda\t_vo3_cur+254\n" "\tsta\t_vo3_best+254\n" .. "\ttlda\t_vo3_cur+0\n" "\tsta\t_vo3_best+0\n"
|
|
Posted By
Harry Potter on 2022-05-31 15:40:18
| Re: cc65 compressor too slow: help needed optimizing
Thank you.
|
|
Posted By
Harry Potter on 2022-05-31 16:52:57
| Re: cc65 compressor too slow: help needed optimizing
I managed to make it slightly faster by moving the testblockpos definition outside the loop and decrementing it each loop.
|
|
Posted By
Harry Potter on 2022-06-01 14:54:44
| Re: cc65 compressor too slow: help needed optimizing
I'm sorry, but that I wasted your time. I deprecated the technique, because I found a technique that's both better and faster. It still pauses for split seconds at a time, though, and I suspect Adaptive Huffman Codes to be the culprit. Following are some of the functions:
-------------------- void __fastcall__ AddHuffNYT (void) { // struct HuffEnv_Entry//* top = GetHuffEntry (HuffEnv.top), //&HuffEnv.Entry[HuffEnv.top], // //*bottomleft = GetHuffEntry (HuffEnv.top-2), //&HuffEnv.Entry[HuffEnv.top-2], // *bottomright = GetHuffEntry (HuffEnv.top-1); //&HuffEnv.Entry[HuffEnv.top-1]; e1 = GetHuffEntry (HuffEnv.top-1); //bottomleft = GetHuffEntry (HuffEnv.top-2); bottomleft = e1; --bottomleft; //backcol++; //e2 = GetHuffEntry (HuffEnv.top-1); //HuffEnv.Entry[HuffEnv.top].child[1] = HuffEnv.top-1; //HuffEnv.Entry[HuffEnv.top].child[0] = HuffEnv.top-2; //top->child[1] = HuffEnv.top-1; //top->child[0] = HuffEnv.top-2; //top->occur++; bottomright->occur = 1; //++top->occur;bottomright->occur = 1; //top->occur = bottomright->occur = e1->bit = 1; //bottomleft->bit = 0; bottomleft->parent = e1->parent = HuffEnv.top; //bottomright->bit = 1; HuffEnv.entrynum[c] = HuffEnv.top-1; e1->c = //HuffEnv.top-1; c; HuffEnv.top- = 2; //relits| = 1<<(c>>4); --numhuffs; //relits| = 1<<(c>>4); }
static unsigned __fastcall__ GetHuffGreatestInBlock (unsigned c) { //register unsigned k; //struct HuffEnv_Entry* h = GetHuffEntry (512); unsigned j = (GetHuffEntry (c))->occur; //register int c2 = GetHuffEntry (c)->c; //register int c2 = e3->c; for (tmpcptr = 512; 1; --tmpcptr){ //if (HuffEnv.Entry[tmpcptr].occur == j) k = tmpcptr; //if (GetHuffEntry (tmpcptr)->occur == j) k = tmpcptr; if (GetHuffEntry (tmpcptr)->occur == j /*&& c2^h->c> = 0*/) return tmpcptr; //--h; } //return c;
}
//void UpdateHuffCode (unsigned char c) void __fastcall__ UpdateHuffCode (void) { //unsigned z = 0; unsigned n, ch = HuffEnv.entrynum[c], x, y; //static struct HuffEnv_Entry* e1; HuffEnv.Entry[512].parent = -1; //do { //backcol = 8; while (ch<512) { //printc ('.'); if (z>10) { // /*cgetc();*/ z = 0;} ++z; //n = GetHuffGreatestInBlock (HuffEnv.Entry[ch].occur); //HuffEnv.Entry[512].parent = -1; e1 = GetHuffEntry (n = GetHuffGreatestInBlock (ch)); e2 = GetHuffEntry (ch); //e1 = GetHuffEntry (n); //if (n> = 512) break; if (n! = ch && e2->parent! = n){ if (/*HuffEnv.Entry[n].c*/((int)e2->c)> = 0) { HuffEnv.entrynum[e2->c] = n; } if (((int)e1->c)> = 0) { HuffEnv.entrynum[e1->c] = ch; } __asm__ ( "\tldy\t#3\n" "\t@a01:\n" "\tlda\t(_e1),y\n" "\tpha\n" "\tlda\t(_e2),y\n" "\tsta\t(_e1),y\n" "\tpla\n" "\tsta\t(_e2),y\n" "\tiny\n" "\tcpy\t#7\n" "\tbcc\t@a01\n" ); } ++e2->occur; ch = e2->parent;//HuffEnv.Entry[ch].parent; } ++HuffEnv.Entry[512].occur; } --------------------
|
|
Posted By
Harry Potter on 2022-06-02 09:43:41
| Re: cc65 compressor too slow: help needed optimizing
The function GetHuffEntry() returns the address of the given entry in the Huffman Code table. I increased the speed by, in the GetHuffGreatestInBlock() function, calling the function once in the beginning and decrementing each loop. It helped. Now, it doesn't pause anymore.
BTW, I was asking for help optimizing, but I have some other optimization techniques. They are at https://sourceforge.net/projects/cc65extra/files/. Try them out, and thank you for your insight.
|
|
| |
Copyright © Plus/4 World Team, 2001-2025. Support Plus/4 World on Patreon |