fast block search part 2 - still ALPHA

now tested... it works but size is not optimal
2025-04-07 13:17:27 -05:00 · 2025-04-07 13:17:27 -05:00 · fc0a7a5e17
parent 2fd6fa1a87
commit fc0a7a5e17
1 changed files with 191 additions and 275 deletions
--- a/src/engine/cmdStreamOps.cpp
+++ b/src/engine/cmdStreamOps.cpp
@ -601,7 +601,7 @@ void reloc(unsigned char* buf, size_t len, unsigned int sourceAddr, unsigned int
  }
 }
-SafeWriter* stripNops(SafeWriter* s, unsigned char* speedDial) {
+SafeWriter* stripNops(SafeWriter* s) {
  std::unordered_map<unsigned int,unsigned int> addrTable;
  SafeWriter* oldStream=s;
  unsigned char* buf=oldStream->getFinalBuf();
@ -666,306 +666,222 @@ struct BlockMatch {
 #define OVERLAPS(a1,a2,b1,b2) ((b1)<(a2) && (b2)>(a1))
-SafeWriter* findSubBlocks(SafeWriter* stream, std::vector<SafeWriter*>& subBlocks, unsigned char* speedDial) {
+// TODO:
 // - check whether a block consists only of calls
 // - see if we can optimize better
 SafeWriter* findSubBlocks(SafeWriter* stream, std::vector<SafeWriter*>& subBlocks) {
  unsigned char* buf=stream->getFinalBuf();
  size_t matchSize=48;
  std::vector<BlockMatch> matches;
-  // fast match algorithm
+  // repeat until we run out of matches
-  // search for small matches, and then find bigger ones
+  while (true) {
-  for (size_t i=0; i<stream->size(); i+=8) {
+    matches.clear();
-    for (size_t j=i+matchSize; j<stream->size(); j+=8) {
+
-      if (memcmp(&buf[i],&buf[j],matchSize)==0) {
+    // fast match algorithm
-        // store this match for later
+    // search for small matches, and then find bigger ones
-        matches.push_back(BlockMatch(i,j,matchSize));
+    for (size_t i=0; i<stream->size(); i+=8) {
      for (size_t j=i+matchSize; j<stream->size(); j+=8) {
        if (memcmp(&buf[i],&buf[j],matchSize)==0) {
          // store this match for later
          matches.push_back(BlockMatch(i,j,matchSize));
        }
      }
    }
  }
-  logD("%d candidates",(int)matches.size());
+    logD("%d candidates",(int)matches.size());
-  // quit if there isn't anything
+    // quit if there isn't anything
-  if (matches.empty()) return stream;
+    if (matches.empty()) return stream;
-  // search for bigger matches
+    // search for bigger matches
-  bool wantMore=true;
+    bool wantMore=true;
-  do {
+    do {
-    size_t matchCount=0;
+      size_t matchCount=0;
-    wantMore=false;
+      wantMore=false;
-    matchSize+=8;
+      matchSize+=8;
-    for (size_t i=0; i<matches.size(); i++) {
+      for (size_t i=0; i<matches.size(); i++) {
-      BlockMatch& b=matches[i];
+        BlockMatch& b=matches[i];
-      // don't do anything if this match is done
+        // don't do anything if this match is done
-      if (b.done) continue;
+        if (b.done) continue;
-      // stop if this match is near the edge
+        // stop if this match is near the edge
-      if ((b.orig+matchSize)>stream->size() || (b.block+matchSize)>stream->size()) {
+        if ((b.orig+matchSize)>stream->size() || (b.block+matchSize)>stream->size()) {
-        b.done=true;
+          b.done=true;
-        continue;
+          continue;
        }
        // check
        if (memcmp(&buf[b.orig],&buf[b.block],matchSize)==0) {
          // this match may be bigger
          b.len=matchSize;
          wantMore=true;
          matchCount++;
        } else {
          // this is the max size
          b.done=true;
        }
      }
      //logV("size %d: %d matches",(int)matchSize,(int)matchCount);
    } while (wantMore);
-      // check
+    // first stage done
-      if (memcmp(&buf[b.orig],&buf[b.block],matchSize)==0) {
+    // set done to false unless:
-        // this match may be bigger
+    // - this match overlaps with itself
-        b.len=matchSize;
+    // - this block only consists of calls
-        wantMore=true;
+    size_t nonOverlapCount=0;
        matchCount++;
      } else {
        // this is the max size
        b.done=true;
      }
    }
    //logV("size %d: %d matches",(int)matchSize,(int)matchCount);
  } while (wantMore);
  // first stage done
  // set done to false unless this match overlaps with itself
  size_t nonOverlapCount=0;
  for (BlockMatch& i: matches) {
    i.done=false;
    if (OVERLAPS(i.orig,i.orig+i.len,i.block,i.block+i.len)) {
      // self-overlapping
      i.done=true;
    } else {
      nonOverlapCount++;
    }
  }
  logD("%d non-overlapping candidates",(int)nonOverlapCount);
  // quit if there isn't anything
  if (!nonOverlapCount) return stream;
  // work on largest matches
  // progress to smaller ones until we run out of them
  logD("largest match: %d",(int)matchSize);
  std::vector<BlockMatch> workMatches;
  while (matchSize>=48) {
    workMatches.clear();
    // find matches with matching size
    for (BlockMatch& i: matches) {
-      if (i.len==matchSize) {
+      i.done=false;
-        // mark it as done and push it
+      if (OVERLAPS(i.orig,i.orig+i.len,i.block,i.block+i.len)) {
-        workMatches.push_back(i);
+        // self-overlapping
        i.done=true;
-      }
+      } else {
-    }
+        nonOverlapCount++;
-
+        /*
-    // make sub-blocks
+        bool onlyCalls=true;
-    size_t lastOrig=SIZE_MAX;
+        for (size_t j=i.orig; j<i.orig+i.len; j+=8) {
-    size_t subBlockID=subBlocks.size();
+          if (buf[j]!=0xf4) {
-    for (BlockMatch& i: workMatches) {
+            onlyCalls=false;
-      // skip invalid matches (yes, this can happen)
+            break;
      if (i.done) continue;
      // create new sub-block if necessary
      if (i.orig!=lastOrig) {
        subBlockID=subBlocks.size();
        // isolate this sub-block
        SafeWriter* newBlock=new SafeWriter;
        newBlock->init();
        newBlock->write(&buf[i.orig],i.len);
        newBlock->writeC(0xf9); // ret
        // padding
        newBlock->writeC(0);
        newBlock->writeC(0);
        newBlock->writeC(0);
        newBlock->writeC(0);
        newBlock->writeC(0);
        newBlock->writeC(0);
        newBlock->writeC(0);
        subBlocks.push_back(newBlock);
        lastOrig=i.orig;
        // insert call on the original block
        buf[i.orig]=0xf4;
        buf[i.orig+1]=subBlockID&0xff;
        buf[i.orig+2]=(subBlockID>>8)&0xff;
        buf[i.orig+3]=(subBlockID>>16)&0xff;
        buf[i.orig+4]=(subBlockID>>24)&0xff;
        buf[i.orig+5]=0;
        buf[i.orig+6]=0;
        buf[i.orig+7]=0;
        // replace the rest with nop
        for (size_t j=i.orig+8; j<i.orig+i.len; j++) {
          buf[j]=0xf1;
        }
      }
      // set match to the last sub-block
      buf[i.block]=0xf4;
      buf[i.block+1]=subBlockID&0xff;
      buf[i.block+2]=(subBlockID>>8)&0xff;
      buf[i.block+3]=(subBlockID>>16)&0xff;
      buf[i.block+4]=(subBlockID>>24)&0xff;
      buf[i.block+5]=0;
      buf[i.block+6]=0;
      buf[i.block+7]=0;
      // replace the rest with nop
      for (size_t j=i.block+8; j<i.block+i.len; j++) {
        buf[j]=0xf1;
      }
      // invalidate overlapping work matches
      for (BlockMatch& j: workMatches) {
        if (j.orig!=i.orig) {
          j.done=true;
        }
        if (OVERLAPS(i.block,i.block+i.len,j.block,j.block+j.len)) {
          j.done=true;
        }
      }
      // invalidate overlapping matches
      for (BlockMatch& j: matches) {
        if (OVERLAPS(i.orig,i.orig+i.len,j.orig,j.orig+j.len) ||
            OVERLAPS(i.orig,i.orig+i.len,j.block,j.block+j.len) ||
            OVERLAPS(i.block,i.block+i.len,j.orig,j.orig+j.len) ||
            OVERLAPS(i.block,i.block+i.len,j.block,j.block+j.len)) {
          j.done=true;
        }
      }
    }
    // try with a smaller size
    matchSize=0;
    for (BlockMatch& i: matches) {
      if (i.done) continue;
      if (i.len>matchSize) matchSize=i.len;
    }
    if (matchSize>=48) {
      logV("trying next size %d",matchSize);
    }
  }
  logV("done!");
  // remove nop's
  stream=stripNops(stream,speedDial);
  /*
  for (size_t groupSize=stream->size()>>1; groupSize>=48; groupSize-=8) {
    bool foundSomething=false;
    logV("...try size %d",groupSize);
    for (size_t searchPos=0; (searchPos+groupSize)<stream->size();) {
      const unsigned char* group=&buf[searchPos];
      size_t groupLen=0;
      size_t groupInsCount=0;
      size_t subBlockID=subBlocks.size();
      bool haveSub=false;
      bool onlyCalls=true;
      // register this block
      for (size_t i=0; i<groupSize && i<stream->size(); i+=8) {
        if (buf[searchPos+i]!=0xf4) onlyCalls=false;
        if (groupLen+8>groupSize) break;
        groupLen+=8;
        groupInsCount++;
      }
      // don't do anything if we don't have a block large enough
      if (groupLen<24) {
        searchPos+=8;
        continue;
      }
      // don't do anything if this is just one or two commands
      // TODO: this is a duplicate of the previous statement now that all commands are 8 bytes long
      if (groupInsCount<3) {
        searchPos+=8;
        continue;
      }
      // don't do anything if this block only consists of calls
      if (onlyCalls) {
        logW("nothing but calls.");
        searchPos+=8;
        continue;
      }
      // find identical blocks
      for (size_t i=searchPos+groupLen; i+groupLen<stream->size();) {
        // compare next block to group
        if (memcmp(&buf[i],group,groupLen)==0) {
          // we have a sub-block
          if (!haveSub) {
            // isolate this sub-block
            SafeWriter* newBlock=new SafeWriter;
            newBlock->init();
            newBlock->write(group,groupLen);
            newBlock->writeC(0xf9); // ret
            // padding
            newBlock->writeC(0);
            newBlock->writeC(0);
            newBlock->writeC(0);
            newBlock->writeC(0);
            newBlock->writeC(0);
            newBlock->writeC(0);
            newBlock->writeC(0);
            subBlocks.push_back(newBlock);
            haveSub=true;
          logD("- SUB %x (size %d):",searchPos,groupLen);
          }
-          logD("  - %x",i);
+        }
-          // insert call
+        if (onlyCalls) {
-          buf[i]=0xf4;
+          i.done=true;
-          buf[i+1]=subBlockID&0xff;
+        } else {
-          buf[i+2]=(subBlockID>>8)&0xff;
+          nonOverlapCount++;
-          buf[i+3]=(subBlockID>>16)&0xff;
+        }
-          buf[i+4]=(subBlockID>>24)&0xff;
+        */
-          buf[i+5]=0;
+      }
-          buf[i+6]=0;
+    }
-          buf[i+7]=0;
+
    logD("%d good candidates",(int)nonOverlapCount);
    // quit if there isn't anything
    if (!nonOverlapCount) return stream;
    // work on largest matches
    // progress to smaller ones until we run out of them
    logD("largest match: %d",(int)matchSize);
    std::vector<BlockMatch> workMatches;
    bool newBlocks=false;
    while (matchSize>=48) {
      workMatches.clear();
      // find matches with matching size
      for (BlockMatch& i: matches) {
        if (i.len==matchSize) {
          // mark it as done and push it
          workMatches.push_back(i);
          i.done=true;
        }
      }
      // make sub-blocks
      size_t lastOrig=SIZE_MAX;
      size_t subBlockID=subBlocks.size();
      for (BlockMatch& i: workMatches) {
        // skip invalid matches (yes, this can happen)
        if (i.done) continue;
        // create new sub-block if necessary
        if (i.orig!=lastOrig) {
          subBlockID=subBlocks.size();
          newBlocks=true;
          logV("new sub-block %d",(int)subBlockID);
          // isolate this sub-block
          SafeWriter* newBlock=new SafeWriter;
          newBlock->init();
          newBlock->write(&buf[i.orig],i.len);
          newBlock->writeC(0xf9); // ret
          // padding
          newBlock->writeC(0);
          newBlock->writeC(0);
          newBlock->writeC(0);
          newBlock->writeC(0);
          newBlock->writeC(0);
          newBlock->writeC(0);
          newBlock->writeC(0);
          subBlocks.push_back(newBlock);
          lastOrig=i.orig;
          // insert call on the original block
          buf[i.orig]=0xf4;
          buf[i.orig+1]=subBlockID&0xff;
          buf[i.orig+2]=(subBlockID>>8)&0xff;
          buf[i.orig+3]=(subBlockID>>16)&0xff;
          buf[i.orig+4]=(subBlockID>>24)&0xff;
          buf[i.orig+5]=0;
          buf[i.orig+6]=0;
          buf[i.orig+7]=0;
          // replace the rest with nop
-          for (size_t j=i+8; j<i+groupLen; j++) {
+          for (size_t j=i.orig+8; j<i.orig+i.len; j++) {
            buf[j]=0xf1;
          }
          // continue search from end of block
          i+=groupLen;
        } else {
          // next
          i+=8;
        }
      }
-      if (haveSub) {
+        // set match to the last sub-block
-        // insert call on the original block
+        buf[i.block]=0xf4;
-        buf[searchPos]=0xf4;
+        buf[i.block+1]=subBlockID&0xff;
-        buf[searchPos+1]=subBlockID&0xff;
+        buf[i.block+2]=(subBlockID>>8)&0xff;
-        buf[searchPos+2]=(subBlockID>>8)&0xff;
+        buf[i.block+3]=(subBlockID>>16)&0xff;
-        buf[searchPos+3]=(subBlockID>>16)&0xff;
+        buf[i.block+4]=(subBlockID>>24)&0xff;
-        buf[searchPos+4]=(subBlockID>>24)&0xff;
+        buf[i.block+5]=0;
-        buf[searchPos+5]=0;
+        buf[i.block+6]=0;
-        buf[searchPos+6]=0;
+        buf[i.block+7]=0;
        buf[searchPos+7]=0;
        // replace the rest with nop
-        for (size_t j=searchPos+8; j<searchPos+groupLen; j++) {
+        for (size_t j=i.block+8; j<i.block+i.len; j++) {
          buf[j]=0xf1;
        }
-        // skip this block (it's isolated now)
+        // invalidate overlapping work matches
-        searchPos+=groupLen;
+        for (BlockMatch& j: workMatches) {
-        foundSomething=true;
+          if (j.orig!=i.orig) {
-      } else {
+            j.done=true;
-        // try again somewhere else
+          }
-        searchPos+=8;
+          if (OVERLAPS(i.block,i.block+i.len,j.block,j.block+j.len)) {
            j.done=true;
          }
        }
        // invalidate overlapping matches
        for (BlockMatch& j: matches) {
          if (OVERLAPS(i.orig,i.orig+i.len,j.orig,j.orig+j.len) ||
              OVERLAPS(i.orig,i.orig+i.len,j.block,j.block+j.len) ||
              OVERLAPS(i.block,i.block+i.len,j.orig,j.orig+j.len) ||
              OVERLAPS(i.block,i.block+i.len,j.block,j.block+j.len)) {
            j.done=true;
          }
        }
      }
      // try with a smaller size
      matchSize=0;
      for (BlockMatch& i: matches) {
        if (i.done) continue;
        if (i.len>matchSize) matchSize=i.len;
      }
      if (matchSize>=48) {
        logV("trying next size %d",matchSize);
      }
    }
-    if (foundSomething) {
+
-      stream=stripNops(stream,speedDial);
+    logV("done!");
-      buf=stream->getFinalBuf();
+
-    }
+    // get out if we haven't made any blocks
    if (!newBlocks) break;
    // remove nop's
    stream=stripNops(stream);
    buf=stream->getFinalBuf();
    logV("doing it again...");
  }
-  */
+
  return stream;
 }
@ -1343,7 +1259,7 @@ SafeWriter* DivEngine::saveCommand(DivCSProgress* progress, unsigned int disable
  // PASS 3: remove nop's
  // this includes modifying call addresses to compensate
  for (int h=0; h<chans; h++) {
-    chanStream[h]=stripNops(chanStream[h],sortedCmd);
+    chanStream[h]=stripNops(chanStream[h]);
  }
  // PASS 4: find sub-blocks and isolate them
@ -1354,7 +1270,7 @@ SafeWriter* DivEngine::saveCommand(DivCSProgress* progress, unsigned int disable
      // 6 is the minimum size that can be reliably optimized
      logI("finding sub-blocks in chan %d",h);
-      chanStream[h]=findSubBlocks(chanStream[h],subBlocks,sortedCmd);
+      chanStream[h]=findSubBlocks(chanStream[h],subBlocks);
      // find sub-blocks within sub-blocks
      size_t subBlocksLast=0;
      size_t subBlocksLen=subBlocks.size();
@ -1362,7 +1278,7 @@ SafeWriter* DivEngine::saveCommand(DivCSProgress* progress, unsigned int disable
      while (subBlocksLast!=subBlocksLen) {
        logI("got %d blocks... starting from %d",(int)subBlocksLen,(int)subBlocksLast);
        for (size_t i=subBlocksLast; i<subBlocksLen; i++) {
-          SafeWriter* newBlock=findSubBlocks(subBlocks[i],subBlocks,sortedCmd);
+          SafeWriter* newBlock=findSubBlocks(subBlocks[i],subBlocks);
          subBlocks[i]=newBlock;
        }
        subBlocksLast=subBlocksLen;
@ -1433,7 +1349,7 @@ SafeWriter* DivEngine::saveCommand(DivCSProgress* progress, unsigned int disable
  // PASS 5: remove nop's (again)
  for (int h=0; h<chans; h++) {
-    chanStream[h]=stripNops(chanStream[h],sortedCmd);
+    chanStream[h]=stripNops(chanStream[h]);
  }
  // PASS 6: pack streams