diff --git a/src/engine/cmdStreamOps.cpp b/src/engine/cmdStreamOps.cpp
index 50443f15d..c8795f142 100644
--- a/src/engine/cmdStreamOps.cpp
+++ b/src/engine/cmdStreamOps.cpp
@@ -601,7 +601,7 @@ void reloc(unsigned char* buf, size_t len, unsigned int sourceAddr, unsigned int
   }
 }
 
-SafeWriter* stripNops(SafeWriter* s, unsigned char* speedDial) {
+SafeWriter* stripNops(SafeWriter* s) {
   std::unordered_map<unsigned int,unsigned int> addrTable;
   SafeWriter* oldStream=s;
   unsigned char* buf=oldStream->getFinalBuf();
@@ -666,306 +666,222 @@ struct BlockMatch {
 
 #define OVERLAPS(a1,a2,b1,b2) ((b1)<(a2) && (b2)>(a1))
 
-SafeWriter* findSubBlocks(SafeWriter* stream, std::vector<SafeWriter*>& subBlocks, unsigned char* speedDial) {
+// TODO:
+// - check whether a block consists only of calls
+// - see if we can optimize better
+SafeWriter* findSubBlocks(SafeWriter* stream, std::vector<SafeWriter*>& subBlocks) {
   unsigned char* buf=stream->getFinalBuf();
   size_t matchSize=48;
   std::vector<BlockMatch> matches;
 
-  // fast match algorithm
-  // search for small matches, and then find bigger ones
-  for (size_t i=0; i<stream->size(); i+=8) {
-    for (size_t j=i+matchSize; j<stream->size(); j+=8) {
-      if (memcmp(&buf[i],&buf[j],matchSize)==0) {
-        // store this match for later
-        matches.push_back(BlockMatch(i,j,matchSize));
+  // repeat until we run out of matches
+  while (true) {
+    matches.clear();
+
+    // fast match algorithm
+    // search for small matches, and then find bigger ones
+    for (size_t i=0; i<stream->size(); i+=8) {
+      for (size_t j=i+matchSize; j<stream->size(); j+=8) {
+        if (memcmp(&buf[i],&buf[j],matchSize)==0) {
+          // store this match for later
+          matches.push_back(BlockMatch(i,j,matchSize));
+        }
       }
     }
-  }
 
-  logD("%d candidates",(int)matches.size());
+    logD("%d candidates",(int)matches.size());
 
-  // quit if there isn't anything
-  if (matches.empty()) return stream;
+    // quit if there isn't anything
+    if (matches.empty()) return stream;
 
-  // search for bigger matches
-  bool wantMore=true;
-  do {
-    size_t matchCount=0;
-    wantMore=false;
-    matchSize+=8;
-    for (size_t i=0; i<matches.size(); i++) {
-      BlockMatch& b=matches[i];
+    // search for bigger matches
+    bool wantMore=true;
+    do {
+      size_t matchCount=0;
+      wantMore=false;
+      matchSize+=8;
+      for (size_t i=0; i<matches.size(); i++) {
+        BlockMatch& b=matches[i];
 
-      // don't do anything if this match is done
-      if (b.done) continue;
+        // don't do anything if this match is done
+        if (b.done) continue;
 
-      // stop if this match is near the edge
-      if ((b.orig+matchSize)>stream->size() || (b.block+matchSize)>stream->size()) {
-        b.done=true;
-        continue;
+        // stop if this match is near the edge
+        if ((b.orig+matchSize)>stream->size() || (b.block+matchSize)>stream->size()) {
+          b.done=true;
+          continue;
+        }
+
+        // check
+        if (memcmp(&buf[b.orig],&buf[b.block],matchSize)==0) {
+          // this match may be bigger
+          b.len=matchSize;
+          wantMore=true;
+          matchCount++;
+        } else {
+          // this is the max size
+          b.done=true;
+        }
       }
+      //logV("size %d: %d matches",(int)matchSize,(int)matchCount);
+    } while (wantMore);
 
-      // check
-      if (memcmp(&buf[b.orig],&buf[b.block],matchSize)==0) {
-        // this match may be bigger
-        b.len=matchSize;
-        wantMore=true;
-        matchCount++;
-      } else {
-        // this is the max size
-        b.done=true;
-      }
-    }
-    //logV("size %d: %d matches",(int)matchSize,(int)matchCount);
-  } while (wantMore);
-
-  // first stage done
-  // set done to false unless this match overlaps with itself
-  size_t nonOverlapCount=0;
-  for (BlockMatch& i: matches) {
-    i.done=false;
-    if (OVERLAPS(i.orig,i.orig+i.len,i.block,i.block+i.len)) {
-      // self-overlapping
-      i.done=true;
-    } else {
-      nonOverlapCount++;
-    }
-  }
-
-  logD("%d non-overlapping candidates",(int)nonOverlapCount);
-
-  // quit if there isn't anything
-  if (!nonOverlapCount) return stream;
-
-  // work on largest matches
-  // progress to smaller ones until we run out of them
-  logD("largest match: %d",(int)matchSize);
-
-  std::vector<BlockMatch> workMatches;
-
-  while (matchSize>=48) {
-    workMatches.clear();
-    // find matches with matching size
+    // first stage done
+    // set done to false unless:
+    // - this match overlaps with itself
+    // - this block only consists of calls
+    size_t nonOverlapCount=0;
     for (BlockMatch& i: matches) {
-      if (i.len==matchSize) {
-        // mark it as done and push it
-        workMatches.push_back(i);
+      i.done=false;
+      if (OVERLAPS(i.orig,i.orig+i.len,i.block,i.block+i.len)) {
+        // self-overlapping
         i.done=true;
-      }
-    }
-
-    // make sub-blocks
-    size_t lastOrig=SIZE_MAX;
-    size_t subBlockID=subBlocks.size();
-    for (BlockMatch& i: workMatches) {
-      // skip invalid matches (yes, this can happen)
-      if (i.done) continue;
-
-      // create new sub-block if necessary
-      if (i.orig!=lastOrig) {
-        subBlockID=subBlocks.size();
-
-        // isolate this sub-block
-        SafeWriter* newBlock=new SafeWriter;
-        newBlock->init();
-        newBlock->write(&buf[i.orig],i.len);
-        newBlock->writeC(0xf9); // ret
-        // padding
-        newBlock->writeC(0);
-        newBlock->writeC(0);
-        newBlock->writeC(0);
-        newBlock->writeC(0);
-        newBlock->writeC(0);
-        newBlock->writeC(0);
-        newBlock->writeC(0);
-        subBlocks.push_back(newBlock);
-        lastOrig=i.orig;
-
-        // insert call on the original block
-        buf[i.orig]=0xf4;
-        buf[i.orig+1]=subBlockID&0xff;
-        buf[i.orig+2]=(subBlockID>>8)&0xff;
-        buf[i.orig+3]=(subBlockID>>16)&0xff;
-        buf[i.orig+4]=(subBlockID>>24)&0xff;
-        buf[i.orig+5]=0;
-        buf[i.orig+6]=0;
-        buf[i.orig+7]=0;
-
-        // replace the rest with nop
-        for (size_t j=i.orig+8; j<i.orig+i.len; j++) {
-          buf[j]=0xf1;
-        }
-      }
-
-      // set match to the last sub-block
-      buf[i.block]=0xf4;
-      buf[i.block+1]=subBlockID&0xff;
-      buf[i.block+2]=(subBlockID>>8)&0xff;
-      buf[i.block+3]=(subBlockID>>16)&0xff;
-      buf[i.block+4]=(subBlockID>>24)&0xff;
-      buf[i.block+5]=0;
-      buf[i.block+6]=0;
-      buf[i.block+7]=0;
-
-      // replace the rest with nop
-      for (size_t j=i.block+8; j<i.block+i.len; j++) {
-        buf[j]=0xf1;
-      }
-
-      // invalidate overlapping work matches
-      for (BlockMatch& j: workMatches) {
-        if (j.orig!=i.orig) {
-          j.done=true;
-        }
-        if (OVERLAPS(i.block,i.block+i.len,j.block,j.block+j.len)) {
-          j.done=true;
-        }
-      }
-
-      // invalidate overlapping matches
-      for (BlockMatch& j: matches) {
-        if (OVERLAPS(i.orig,i.orig+i.len,j.orig,j.orig+j.len) ||
-            OVERLAPS(i.orig,i.orig+i.len,j.block,j.block+j.len) ||
-            OVERLAPS(i.block,i.block+i.len,j.orig,j.orig+j.len) ||
-            OVERLAPS(i.block,i.block+i.len,j.block,j.block+j.len)) {
-          j.done=true;
-        }
-      }
-    }
-
-    // try with a smaller size
-    matchSize=0;
-    for (BlockMatch& i: matches) {
-      if (i.done) continue;
-      if (i.len>matchSize) matchSize=i.len;
-    }
-    if (matchSize>=48) {
-      logV("trying next size %d",matchSize);
-    }
-  }
-
-  logV("done!");
-
-  // remove nop's
-  stream=stripNops(stream,speedDial);
-
-  /*
-  for (size_t groupSize=stream->size()>>1; groupSize>=48; groupSize-=8) {
-    bool foundSomething=false;
-    logV("...try size %d",groupSize);
-    for (size_t searchPos=0; (searchPos+groupSize)<stream->size();) {
-      const unsigned char* group=&buf[searchPos];
-      size_t groupLen=0;
-      size_t groupInsCount=0;
-      size_t subBlockID=subBlocks.size();
-      bool haveSub=false;
-      bool onlyCalls=true;
-
-      // register this block
-      for (size_t i=0; i<groupSize && i<stream->size(); i+=8) {
-        if (buf[searchPos+i]!=0xf4) onlyCalls=false;
-        if (groupLen+8>groupSize) break;
-        groupLen+=8;
-        groupInsCount++;
-      }
-
-      // don't do anything if we don't have a block large enough
-      if (groupLen<24) {
-        searchPos+=8;
-        continue;
-      }
-
-      // don't do anything if this is just one or two commands
-      // TODO: this is a duplicate of the previous statement now that all commands are 8 bytes long
-      if (groupInsCount<3) {
-        searchPos+=8;
-        continue;
-      }
-
-      // don't do anything if this block only consists of calls
-      if (onlyCalls) {
-        logW("nothing but calls.");
-        searchPos+=8;
-        continue;
-      }
-
-      // find identical blocks
-      for (size_t i=searchPos+groupLen; i+groupLen<stream->size();) {
-        // compare next block to group
-        if (memcmp(&buf[i],group,groupLen)==0) {
-          // we have a sub-block
-          if (!haveSub) {
-            // isolate this sub-block
-            SafeWriter* newBlock=new SafeWriter;
-            newBlock->init();
-            newBlock->write(group,groupLen);
-            newBlock->writeC(0xf9); // ret
-            // padding
-            newBlock->writeC(0);
-            newBlock->writeC(0);
-            newBlock->writeC(0);
-            newBlock->writeC(0);
-            newBlock->writeC(0);
-            newBlock->writeC(0);
-            newBlock->writeC(0);
-            subBlocks.push_back(newBlock);
-            haveSub=true;
-          logD("- SUB %x (size %d):",searchPos,groupLen);
+      } else {
+        nonOverlapCount++;
+        /*
+        bool onlyCalls=true;
+        for (size_t j=i.orig; j<i.orig+i.len; j+=8) {
+          if (buf[j]!=0xf4) {
+            onlyCalls=false;
+            break;
           }
-          logD("  - %x",i);
-          // insert call
-          buf[i]=0xf4;
-          buf[i+1]=subBlockID&0xff;
-          buf[i+2]=(subBlockID>>8)&0xff;
-          buf[i+3]=(subBlockID>>16)&0xff;
-          buf[i+4]=(subBlockID>>24)&0xff;
-          buf[i+5]=0;
-          buf[i+6]=0;
-          buf[i+7]=0;
+        }
+        if (onlyCalls) {
+          i.done=true;
+        } else {
+          nonOverlapCount++;
+        }
+        */
+      }
+    }
+
+    logD("%d good candidates",(int)nonOverlapCount);
+
+    // quit if there isn't anything
+    if (!nonOverlapCount) return stream;
+
+    // work on largest matches
+    // progress to smaller ones until we run out of them
+    logD("largest match: %d",(int)matchSize);
+
+    std::vector<BlockMatch> workMatches;
+    bool newBlocks=false;
+
+    while (matchSize>=48) {
+      workMatches.clear();
+      // find matches with matching size
+      for (BlockMatch& i: matches) {
+        if (i.len==matchSize) {
+          // mark it as done and push it
+          workMatches.push_back(i);
+          i.done=true;
+        }
+      }
+
+      // make sub-blocks
+      size_t lastOrig=SIZE_MAX;
+      size_t subBlockID=subBlocks.size();
+      for (BlockMatch& i: workMatches) {
+        // skip invalid matches (yes, this can happen)
+        if (i.done) continue;
+
+        // create new sub-block if necessary
+        if (i.orig!=lastOrig) {
+          subBlockID=subBlocks.size();
+          newBlocks=true;
+          logV("new sub-block %d",(int)subBlockID);
+
+          // isolate this sub-block
+          SafeWriter* newBlock=new SafeWriter;
+          newBlock->init();
+          newBlock->write(&buf[i.orig],i.len);
+          newBlock->writeC(0xf9); // ret
+          // padding
+          newBlock->writeC(0);
+          newBlock->writeC(0);
+          newBlock->writeC(0);
+          newBlock->writeC(0);
+          newBlock->writeC(0);
+          newBlock->writeC(0);
+          newBlock->writeC(0);
+          subBlocks.push_back(newBlock);
+          lastOrig=i.orig;
+
+          // insert call on the original block
+          buf[i.orig]=0xf4;
+          buf[i.orig+1]=subBlockID&0xff;
+          buf[i.orig+2]=(subBlockID>>8)&0xff;
+          buf[i.orig+3]=(subBlockID>>16)&0xff;
+          buf[i.orig+4]=(subBlockID>>24)&0xff;
+          buf[i.orig+5]=0;
+          buf[i.orig+6]=0;
+          buf[i.orig+7]=0;
 
           // replace the rest with nop
-          for (size_t j=i+8; j<i+groupLen; j++) {
+          for (size_t j=i.orig+8; j<i.orig+i.len; j++) {
             buf[j]=0xf1;
           }
-
-          // continue search from end of block
-          i+=groupLen;
-        } else {
-          // next
-          i+=8;
         }
-      }
 
-      if (haveSub) {
-        // insert call on the original block
-        buf[searchPos]=0xf4;
-        buf[searchPos+1]=subBlockID&0xff;
-        buf[searchPos+2]=(subBlockID>>8)&0xff;
-        buf[searchPos+3]=(subBlockID>>16)&0xff;
-        buf[searchPos+4]=(subBlockID>>24)&0xff;
-        buf[searchPos+5]=0;
-        buf[searchPos+6]=0;
-        buf[searchPos+7]=0;
+        // set match to the last sub-block
+        buf[i.block]=0xf4;
+        buf[i.block+1]=subBlockID&0xff;
+        buf[i.block+2]=(subBlockID>>8)&0xff;
+        buf[i.block+3]=(subBlockID>>16)&0xff;
+        buf[i.block+4]=(subBlockID>>24)&0xff;
+        buf[i.block+5]=0;
+        buf[i.block+6]=0;
+        buf[i.block+7]=0;
 
         // replace the rest with nop
-        for (size_t j=searchPos+8; j<searchPos+groupLen; j++) {
+        for (size_t j=i.block+8; j<i.block+i.len; j++) {
           buf[j]=0xf1;
         }
 
-        // skip this block (it's isolated now)
-        searchPos+=groupLen;
-        foundSomething=true;
-      } else {
-        // try again somewhere else
-        searchPos+=8;
+        // invalidate overlapping work matches
+        for (BlockMatch& j: workMatches) {
+          if (j.orig!=i.orig) {
+            j.done=true;
+          }
+          if (OVERLAPS(i.block,i.block+i.len,j.block,j.block+j.len)) {
+            j.done=true;
+          }
+        }
+
+        // invalidate overlapping matches
+        for (BlockMatch& j: matches) {
+          if (OVERLAPS(i.orig,i.orig+i.len,j.orig,j.orig+j.len) ||
+              OVERLAPS(i.orig,i.orig+i.len,j.block,j.block+j.len) ||
+              OVERLAPS(i.block,i.block+i.len,j.orig,j.orig+j.len) ||
+              OVERLAPS(i.block,i.block+i.len,j.block,j.block+j.len)) {
+            j.done=true;
+          }
+        }
+      }
+
+      // try with a smaller size
+      matchSize=0;
+      for (BlockMatch& i: matches) {
+        if (i.done) continue;
+        if (i.len>matchSize) matchSize=i.len;
+      }
+      if (matchSize>=48) {
+        logV("trying next size %d",matchSize);
       }
     }
-    if (foundSomething) {
-      stream=stripNops(stream,speedDial);
-      buf=stream->getFinalBuf();
-    }
+
+    logV("done!");
+
+    // get out if we haven't made any blocks
+    if (!newBlocks) break;
+
+    // remove nop's
+    stream=stripNops(stream);
+    buf=stream->getFinalBuf();
+
+    logV("doing it again...");
   }
-  */
+
   return stream;
 }
 
@@ -1343,7 +1259,7 @@ SafeWriter* DivEngine::saveCommand(DivCSProgress* progress, unsigned int disable
   // PASS 3: remove nop's
   // this includes modifying call addresses to compensate
   for (int h=0; h<chans; h++) {
-    chanStream[h]=stripNops(chanStream[h],sortedCmd);
+    chanStream[h]=stripNops(chanStream[h]);
   }
 
   // PASS 4: find sub-blocks and isolate them
@@ -1354,7 +1270,7 @@ SafeWriter* DivEngine::saveCommand(DivCSProgress* progress, unsigned int disable
       
       // 6 is the minimum size that can be reliably optimized
       logI("finding sub-blocks in chan %d",h);
-      chanStream[h]=findSubBlocks(chanStream[h],subBlocks,sortedCmd);
+      chanStream[h]=findSubBlocks(chanStream[h],subBlocks);
       // find sub-blocks within sub-blocks
       size_t subBlocksLast=0;
       size_t subBlocksLen=subBlocks.size();
@@ -1362,7 +1278,7 @@ SafeWriter* DivEngine::saveCommand(DivCSProgress* progress, unsigned int disable
       while (subBlocksLast!=subBlocksLen) {
         logI("got %d blocks... starting from %d",(int)subBlocksLen,(int)subBlocksLast);
         for (size_t i=subBlocksLast; i<subBlocksLen; i++) {
-          SafeWriter* newBlock=findSubBlocks(subBlocks[i],subBlocks,sortedCmd);
+          SafeWriter* newBlock=findSubBlocks(subBlocks[i],subBlocks);
           subBlocks[i]=newBlock;
         }
         subBlocksLast=subBlocksLen;
@@ -1433,7 +1349,7 @@ SafeWriter* DivEngine::saveCommand(DivCSProgress* progress, unsigned int disable
 
   // PASS 5: remove nop's (again)
   for (int h=0; h<chans; h++) {
-    chanStream[h]=stripNops(chanStream[h],sortedCmd);
+    chanStream[h]=stripNops(chanStream[h]);
   }
 
   // PASS 6: pack streams