fast block search part 2 - still ALPHA

now tested... it works but size is not optimal
This commit is contained in:
tildearrow 2025-04-07 13:17:27 -05:00
parent 2fd6fa1a87
commit fc0a7a5e17

View file

@ -601,7 +601,7 @@ void reloc(unsigned char* buf, size_t len, unsigned int sourceAddr, unsigned int
} }
} }
SafeWriter* stripNops(SafeWriter* s, unsigned char* speedDial) { SafeWriter* stripNops(SafeWriter* s) {
std::unordered_map<unsigned int,unsigned int> addrTable; std::unordered_map<unsigned int,unsigned int> addrTable;
SafeWriter* oldStream=s; SafeWriter* oldStream=s;
unsigned char* buf=oldStream->getFinalBuf(); unsigned char* buf=oldStream->getFinalBuf();
@ -666,306 +666,222 @@ struct BlockMatch {
#define OVERLAPS(a1,a2,b1,b2) ((b1)<(a2) && (b2)>(a1)) #define OVERLAPS(a1,a2,b1,b2) ((b1)<(a2) && (b2)>(a1))
SafeWriter* findSubBlocks(SafeWriter* stream, std::vector<SafeWriter*>& subBlocks, unsigned char* speedDial) { // TODO:
// - check whether a block consists only of calls
// - see if we can optimize better
SafeWriter* findSubBlocks(SafeWriter* stream, std::vector<SafeWriter*>& subBlocks) {
unsigned char* buf=stream->getFinalBuf(); unsigned char* buf=stream->getFinalBuf();
size_t matchSize=48; size_t matchSize=48;
std::vector<BlockMatch> matches; std::vector<BlockMatch> matches;
// fast match algorithm // repeat until we run out of matches
// search for small matches, and then find bigger ones while (true) {
for (size_t i=0; i<stream->size(); i+=8) { matches.clear();
for (size_t j=i+matchSize; j<stream->size(); j+=8) {
if (memcmp(&buf[i],&buf[j],matchSize)==0) { // fast match algorithm
// store this match for later // search for small matches, and then find bigger ones
matches.push_back(BlockMatch(i,j,matchSize)); for (size_t i=0; i<stream->size(); i+=8) {
for (size_t j=i+matchSize; j<stream->size(); j+=8) {
if (memcmp(&buf[i],&buf[j],matchSize)==0) {
// store this match for later
matches.push_back(BlockMatch(i,j,matchSize));
}
} }
} }
}
logD("%d candidates",(int)matches.size()); logD("%d candidates",(int)matches.size());
// quit if there isn't anything // quit if there isn't anything
if (matches.empty()) return stream; if (matches.empty()) return stream;
// search for bigger matches // search for bigger matches
bool wantMore=true; bool wantMore=true;
do { do {
size_t matchCount=0; size_t matchCount=0;
wantMore=false; wantMore=false;
matchSize+=8; matchSize+=8;
for (size_t i=0; i<matches.size(); i++) { for (size_t i=0; i<matches.size(); i++) {
BlockMatch& b=matches[i]; BlockMatch& b=matches[i];
// don't do anything if this match is done // don't do anything if this match is done
if (b.done) continue; if (b.done) continue;
// stop if this match is near the edge // stop if this match is near the edge
if ((b.orig+matchSize)>stream->size() || (b.block+matchSize)>stream->size()) { if ((b.orig+matchSize)>stream->size() || (b.block+matchSize)>stream->size()) {
b.done=true; b.done=true;
continue; continue;
}
// check
if (memcmp(&buf[b.orig],&buf[b.block],matchSize)==0) {
// this match may be bigger
b.len=matchSize;
wantMore=true;
matchCount++;
} else {
// this is the max size
b.done=true;
}
} }
//logV("size %d: %d matches",(int)matchSize,(int)matchCount);
} while (wantMore);
// check // first stage done
if (memcmp(&buf[b.orig],&buf[b.block],matchSize)==0) { // set done to false unless:
// this match may be bigger // - this match overlaps with itself
b.len=matchSize; // - this block only consists of calls
wantMore=true; size_t nonOverlapCount=0;
matchCount++;
} else {
// this is the max size
b.done=true;
}
}
//logV("size %d: %d matches",(int)matchSize,(int)matchCount);
} while (wantMore);
// first stage done
// set done to false unless this match overlaps with itself
size_t nonOverlapCount=0;
for (BlockMatch& i: matches) {
i.done=false;
if (OVERLAPS(i.orig,i.orig+i.len,i.block,i.block+i.len)) {
// self-overlapping
i.done=true;
} else {
nonOverlapCount++;
}
}
logD("%d non-overlapping candidates",(int)nonOverlapCount);
// quit if there isn't anything
if (!nonOverlapCount) return stream;
// work on largest matches
// progress to smaller ones until we run out of them
logD("largest match: %d",(int)matchSize);
std::vector<BlockMatch> workMatches;
while (matchSize>=48) {
workMatches.clear();
// find matches with matching size
for (BlockMatch& i: matches) { for (BlockMatch& i: matches) {
if (i.len==matchSize) { i.done=false;
// mark it as done and push it if (OVERLAPS(i.orig,i.orig+i.len,i.block,i.block+i.len)) {
workMatches.push_back(i); // self-overlapping
i.done=true; i.done=true;
} } else {
} nonOverlapCount++;
/*
// make sub-blocks bool onlyCalls=true;
size_t lastOrig=SIZE_MAX; for (size_t j=i.orig; j<i.orig+i.len; j+=8) {
size_t subBlockID=subBlocks.size(); if (buf[j]!=0xf4) {
for (BlockMatch& i: workMatches) { onlyCalls=false;
// skip invalid matches (yes, this can happen) break;
if (i.done) continue;
// create new sub-block if necessary
if (i.orig!=lastOrig) {
subBlockID=subBlocks.size();
// isolate this sub-block
SafeWriter* newBlock=new SafeWriter;
newBlock->init();
newBlock->write(&buf[i.orig],i.len);
newBlock->writeC(0xf9); // ret
// padding
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
subBlocks.push_back(newBlock);
lastOrig=i.orig;
// insert call on the original block
buf[i.orig]=0xf4;
buf[i.orig+1]=subBlockID&0xff;
buf[i.orig+2]=(subBlockID>>8)&0xff;
buf[i.orig+3]=(subBlockID>>16)&0xff;
buf[i.orig+4]=(subBlockID>>24)&0xff;
buf[i.orig+5]=0;
buf[i.orig+6]=0;
buf[i.orig+7]=0;
// replace the rest with nop
for (size_t j=i.orig+8; j<i.orig+i.len; j++) {
buf[j]=0xf1;
}
}
// set match to the last sub-block
buf[i.block]=0xf4;
buf[i.block+1]=subBlockID&0xff;
buf[i.block+2]=(subBlockID>>8)&0xff;
buf[i.block+3]=(subBlockID>>16)&0xff;
buf[i.block+4]=(subBlockID>>24)&0xff;
buf[i.block+5]=0;
buf[i.block+6]=0;
buf[i.block+7]=0;
// replace the rest with nop
for (size_t j=i.block+8; j<i.block+i.len; j++) {
buf[j]=0xf1;
}
// invalidate overlapping work matches
for (BlockMatch& j: workMatches) {
if (j.orig!=i.orig) {
j.done=true;
}
if (OVERLAPS(i.block,i.block+i.len,j.block,j.block+j.len)) {
j.done=true;
}
}
// invalidate overlapping matches
for (BlockMatch& j: matches) {
if (OVERLAPS(i.orig,i.orig+i.len,j.orig,j.orig+j.len) ||
OVERLAPS(i.orig,i.orig+i.len,j.block,j.block+j.len) ||
OVERLAPS(i.block,i.block+i.len,j.orig,j.orig+j.len) ||
OVERLAPS(i.block,i.block+i.len,j.block,j.block+j.len)) {
j.done=true;
}
}
}
// try with a smaller size
matchSize=0;
for (BlockMatch& i: matches) {
if (i.done) continue;
if (i.len>matchSize) matchSize=i.len;
}
if (matchSize>=48) {
logV("trying next size %d",matchSize);
}
}
logV("done!");
// remove nop's
stream=stripNops(stream,speedDial);
/*
for (size_t groupSize=stream->size()>>1; groupSize>=48; groupSize-=8) {
bool foundSomething=false;
logV("...try size %d",groupSize);
for (size_t searchPos=0; (searchPos+groupSize)<stream->size();) {
const unsigned char* group=&buf[searchPos];
size_t groupLen=0;
size_t groupInsCount=0;
size_t subBlockID=subBlocks.size();
bool haveSub=false;
bool onlyCalls=true;
// register this block
for (size_t i=0; i<groupSize && i<stream->size(); i+=8) {
if (buf[searchPos+i]!=0xf4) onlyCalls=false;
if (groupLen+8>groupSize) break;
groupLen+=8;
groupInsCount++;
}
// don't do anything if we don't have a block large enough
if (groupLen<24) {
searchPos+=8;
continue;
}
// don't do anything if this is just one or two commands
// TODO: this is a duplicate of the previous statement now that all commands are 8 bytes long
if (groupInsCount<3) {
searchPos+=8;
continue;
}
// don't do anything if this block only consists of calls
if (onlyCalls) {
logW("nothing but calls.");
searchPos+=8;
continue;
}
// find identical blocks
for (size_t i=searchPos+groupLen; i+groupLen<stream->size();) {
// compare next block to group
if (memcmp(&buf[i],group,groupLen)==0) {
// we have a sub-block
if (!haveSub) {
// isolate this sub-block
SafeWriter* newBlock=new SafeWriter;
newBlock->init();
newBlock->write(group,groupLen);
newBlock->writeC(0xf9); // ret
// padding
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
subBlocks.push_back(newBlock);
haveSub=true;
logD("- SUB %x (size %d):",searchPos,groupLen);
} }
logD(" - %x",i); }
// insert call if (onlyCalls) {
buf[i]=0xf4; i.done=true;
buf[i+1]=subBlockID&0xff; } else {
buf[i+2]=(subBlockID>>8)&0xff; nonOverlapCount++;
buf[i+3]=(subBlockID>>16)&0xff; }
buf[i+4]=(subBlockID>>24)&0xff; */
buf[i+5]=0; }
buf[i+6]=0; }
buf[i+7]=0;
logD("%d good candidates",(int)nonOverlapCount);
// quit if there isn't anything
if (!nonOverlapCount) return stream;
// work on largest matches
// progress to smaller ones until we run out of them
logD("largest match: %d",(int)matchSize);
std::vector<BlockMatch> workMatches;
bool newBlocks=false;
while (matchSize>=48) {
workMatches.clear();
// find matches with matching size
for (BlockMatch& i: matches) {
if (i.len==matchSize) {
// mark it as done and push it
workMatches.push_back(i);
i.done=true;
}
}
// make sub-blocks
size_t lastOrig=SIZE_MAX;
size_t subBlockID=subBlocks.size();
for (BlockMatch& i: workMatches) {
// skip invalid matches (yes, this can happen)
if (i.done) continue;
// create new sub-block if necessary
if (i.orig!=lastOrig) {
subBlockID=subBlocks.size();
newBlocks=true;
logV("new sub-block %d",(int)subBlockID);
// isolate this sub-block
SafeWriter* newBlock=new SafeWriter;
newBlock->init();
newBlock->write(&buf[i.orig],i.len);
newBlock->writeC(0xf9); // ret
// padding
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
newBlock->writeC(0);
subBlocks.push_back(newBlock);
lastOrig=i.orig;
// insert call on the original block
buf[i.orig]=0xf4;
buf[i.orig+1]=subBlockID&0xff;
buf[i.orig+2]=(subBlockID>>8)&0xff;
buf[i.orig+3]=(subBlockID>>16)&0xff;
buf[i.orig+4]=(subBlockID>>24)&0xff;
buf[i.orig+5]=0;
buf[i.orig+6]=0;
buf[i.orig+7]=0;
// replace the rest with nop // replace the rest with nop
for (size_t j=i+8; j<i+groupLen; j++) { for (size_t j=i.orig+8; j<i.orig+i.len; j++) {
buf[j]=0xf1; buf[j]=0xf1;
} }
// continue search from end of block
i+=groupLen;
} else {
// next
i+=8;
} }
}
if (haveSub) { // set match to the last sub-block
// insert call on the original block buf[i.block]=0xf4;
buf[searchPos]=0xf4; buf[i.block+1]=subBlockID&0xff;
buf[searchPos+1]=subBlockID&0xff; buf[i.block+2]=(subBlockID>>8)&0xff;
buf[searchPos+2]=(subBlockID>>8)&0xff; buf[i.block+3]=(subBlockID>>16)&0xff;
buf[searchPos+3]=(subBlockID>>16)&0xff; buf[i.block+4]=(subBlockID>>24)&0xff;
buf[searchPos+4]=(subBlockID>>24)&0xff; buf[i.block+5]=0;
buf[searchPos+5]=0; buf[i.block+6]=0;
buf[searchPos+6]=0; buf[i.block+7]=0;
buf[searchPos+7]=0;
// replace the rest with nop // replace the rest with nop
for (size_t j=searchPos+8; j<searchPos+groupLen; j++) { for (size_t j=i.block+8; j<i.block+i.len; j++) {
buf[j]=0xf1; buf[j]=0xf1;
} }
// skip this block (it's isolated now) // invalidate overlapping work matches
searchPos+=groupLen; for (BlockMatch& j: workMatches) {
foundSomething=true; if (j.orig!=i.orig) {
} else { j.done=true;
// try again somewhere else }
searchPos+=8; if (OVERLAPS(i.block,i.block+i.len,j.block,j.block+j.len)) {
j.done=true;
}
}
// invalidate overlapping matches
for (BlockMatch& j: matches) {
if (OVERLAPS(i.orig,i.orig+i.len,j.orig,j.orig+j.len) ||
OVERLAPS(i.orig,i.orig+i.len,j.block,j.block+j.len) ||
OVERLAPS(i.block,i.block+i.len,j.orig,j.orig+j.len) ||
OVERLAPS(i.block,i.block+i.len,j.block,j.block+j.len)) {
j.done=true;
}
}
}
// try with a smaller size
matchSize=0;
for (BlockMatch& i: matches) {
if (i.done) continue;
if (i.len>matchSize) matchSize=i.len;
}
if (matchSize>=48) {
logV("trying next size %d",matchSize);
} }
} }
if (foundSomething) {
stream=stripNops(stream,speedDial); logV("done!");
buf=stream->getFinalBuf();
} // get out if we haven't made any blocks
if (!newBlocks) break;
// remove nop's
stream=stripNops(stream);
buf=stream->getFinalBuf();
logV("doing it again...");
} }
*/
return stream; return stream;
} }
@ -1343,7 +1259,7 @@ SafeWriter* DivEngine::saveCommand(DivCSProgress* progress, unsigned int disable
// PASS 3: remove nop's // PASS 3: remove nop's
// this includes modifying call addresses to compensate // this includes modifying call addresses to compensate
for (int h=0; h<chans; h++) { for (int h=0; h<chans; h++) {
chanStream[h]=stripNops(chanStream[h],sortedCmd); chanStream[h]=stripNops(chanStream[h]);
} }
// PASS 4: find sub-blocks and isolate them // PASS 4: find sub-blocks and isolate them
@ -1354,7 +1270,7 @@ SafeWriter* DivEngine::saveCommand(DivCSProgress* progress, unsigned int disable
// 6 is the minimum size that can be reliably optimized // 6 is the minimum size that can be reliably optimized
logI("finding sub-blocks in chan %d",h); logI("finding sub-blocks in chan %d",h);
chanStream[h]=findSubBlocks(chanStream[h],subBlocks,sortedCmd); chanStream[h]=findSubBlocks(chanStream[h],subBlocks);
// find sub-blocks within sub-blocks // find sub-blocks within sub-blocks
size_t subBlocksLast=0; size_t subBlocksLast=0;
size_t subBlocksLen=subBlocks.size(); size_t subBlocksLen=subBlocks.size();
@ -1362,7 +1278,7 @@ SafeWriter* DivEngine::saveCommand(DivCSProgress* progress, unsigned int disable
while (subBlocksLast!=subBlocksLen) { while (subBlocksLast!=subBlocksLen) {
logI("got %d blocks... starting from %d",(int)subBlocksLen,(int)subBlocksLast); logI("got %d blocks... starting from %d",(int)subBlocksLen,(int)subBlocksLast);
for (size_t i=subBlocksLast; i<subBlocksLen; i++) { for (size_t i=subBlocksLast; i<subBlocksLen; i++) {
SafeWriter* newBlock=findSubBlocks(subBlocks[i],subBlocks,sortedCmd); SafeWriter* newBlock=findSubBlocks(subBlocks[i],subBlocks);
subBlocks[i]=newBlock; subBlocks[i]=newBlock;
} }
subBlocksLast=subBlocksLen; subBlocksLast=subBlocksLen;
@ -1433,7 +1349,7 @@ SafeWriter* DivEngine::saveCommand(DivCSProgress* progress, unsigned int disable
// PASS 5: remove nop's (again) // PASS 5: remove nop's (again)
for (int h=0; h<chans; h++) { for (int h=0; h<chans; h++) {
chanStream[h]=stripNops(chanStream[h],sortedCmd); chanStream[h]=stripNops(chanStream[h]);
} }
// PASS 6: pack streams // PASS 6: pack streams