Results 1 to 12 of 12

Thread: help me with CM in paq8

  1. #1
    Member
    Join Date
    Nov 2014
    Location
    China
    Posts
    11
    Thanks
    3
    Thanked 0 Times in 0 Posts

    help me with CM in paq8

    hi there,

    I have changed the execution sequence of contextmap::mix1 as following, will it cause bad consequence or degrade in compression ratio?

    My code:
    Code:
    // Modified ContextMap::mix1: the original single interleaved per-context loop
    // has been split into three passes (state update, context-pointer refresh,
    // prediction) plus a run-count pass done only at byte boundaries.
    // NOTE(review): this is only equivalent to the original if no two contexts
    // alias the same hash-table row in the same byte -- see notes below.
    // Context arguments: cc=c0 (partial byte), bp=bpos (bit position 0-7),
    // c1=buf(1) (previous byte), y1=y (last coded bit).
    // Returns the number of active contexts whose bit-history state is nonzero.
    int ContextMap::mix1(Mixer& m, int cc, int bp, int c1, int y1) {
        // Update model with y
        int result=0;
        for (int i=0; i<cn; ++i) {
            // NOTE(review): rnd[i]() is now drawn unconditionally; the original
            // only called rnd() when ns>=204, so RNG consumption differs.
            int rndResult = rnd[i]();
            if (cp[i]) {
                assert(cp[i]>=&t[0].bh[0][0] && cp[i]<=&t[t.size()-1].bh[6][6]);
                assert((long(cp[i])&63)>=15);
                int ns=nex(*cp[i], y1);  // next bit-history state after bit y1
                if (ns>=204 && rndResult << ((452-ns)>>3)) ns-=4;  // probabilistic increment
                *cp[i]=ns;
            }
        }//CHANGED HERE: calc all nex()

        if(g_bpos == 0)//CHANGED HERE: extracted from switch(g_bpos){default:}
        {
            // Run-count update for the previous context, hoisted out of the
            // default case of the switch below.  NOTE(review): runp[i] still
            // points at the previous byte's table row here; if that row can be
            // replaced by another context's get() in the meantime, this writes
            // into the wrong slot, and hoisting it widens that window.
            for (int i=0; i<cn; ++i) {
                // Update run count of previous context
                if (runp[i][0]==0)  // new context
                    runp[i][0]=2, runp[i][1]=c1;
                else if (runp[i][1]!=c1)  // different byte in context
                    runp[i][0]=1, runp[i][1]=c1;
                else if (runp[i][0]<254)  // same byte in context
                    runp[i][0]+=2;
            }
        }

        for (int i=0; i<cn; ++i) {//CHANGED HERE: calc all t.get()
            // Update context pointers
            // current_runp0[i] caches runp[i][0] from the last byte boundary, so
            // this test no longer observes run counts written through aliasing
            // rows since then ("don't sync with main memory" below).
            if (g_bpos>1 && current_runp0[i]==0)
            {
                cp[i]=0;
            }
            else
            {
                // Per the comment on table t, a byte takes three hash lookups
                // (bpos 0, 2 and 5); other bit positions index within the row
                // fetched at the most recent lookup.
                switch(g_bpos)
                {
                    case 1: case 3: case 6: cp[i]=cp0[i]+1+(cc&1); break;
                    case 4: case 7: cp[i]=cp0[i]+3+(cc&3); break;
                    case 2: case 5: cp0[i]=cp[i]=t[(cxt[i]+cc)&(t.size()-1)].get(cxt[i]>>16); break;
                    default:
                    {
                        cp0[i]=cp[i]=t[(cxt[i]+cc)&(t.size()-1)].get(cxt[i]>>16);
                        // Update pending bit histories for bits 2-7
                        // (deferred work from a previous byte; cp0[i][3]==2 with
                        // the byte value saved in cp0[i][4] marks it pending).
                        if (cp0[i][3]==2) {
                            const int c=cp0[i][4]+256;
                            U8 *p=t[(cxt[i]+(c>>6))&(t.size()-1)].get(cxt[i]>>16);
                            p[0]=1+((c>>5)&1);
                            p[1+((c>>5)&1)]=1+((c>>4)&1);
                            p[3+((c>>4)&3)]=1+((c>>3)&1);
                            p=t[(cxt[i]+(c>>3))&(t.size()-1)].get(cxt[i]>>16);
                            p[0]=1+((c>>2)&1);
                            p[1+((c>>2)&1)]=1+((c>>1)&1);
                            p[3+((c>>1)&3)]=1+(c&1);
                            cp0[i][6]=0;
                        }
                        runp[i]=cp0[i]+3;
                        current_runp0[i] = runp[i][0];//CHANGED HERE: don't sync with main memory
                        current_runp1[i] = runp[i][1];//CHANGED HERE: don't sync with main memory
                    } break;
                }
            }
        }

        for (int i=0; i<cn; ++i) {//CHANGED HERE: calc all predict

            // predict from last byte in context
            // (only when the cached run byte agrees with the cc bits seen so far)
            if ((current_runp1[i]+256)>>(8-bp)==cc) {
                int rc=current_runp0[i];  // count*2, +1 if 2 different bytes seen
                int b=(current_runp1[i]>>(7-bp)&1)*2-1;  // predicted bit + for 1, - for 0
                int c=ilog(rc+1)<<(2+(~rc&1));  // confidence grows with run length
                m.add(b*c);
            }
            else
            {
                m.add(0);
            }


            // predict from bit context
            int p;;
            if (cp[i])
            {
                result+=(*cp[i]>0);
                p = sm[i].p(*cp[i]);
            }
            else
            {
                p = sm[i].p(0);  // no usable history: predict from the null state
            }
            m.add(stretch(p));

        }

        if (bp==7) cn=0;  // byte complete; contexts must be set() again
        return result;
    }
    Original CODE:
    Code:
    // Original ContextMap::mix1 (quoted for comparison).  A single pass over the
    // cn active contexts: update the bit-history state with the last coded bit
    // y1, advance the context pointer for the new bit position (doing a hash
    // lookup every 3rd bit), then feed two predictions per context into mixer m
    // (run model + bit-history model).
    // Context arguments: cc=c0 (partial byte), bp=bpos, c1=buf(1), y1=y.
    // Returns the number of contexts with a nonzero bit-history state.
    int ContextMap::mix1(Mixer& m, int cc, int bp, int c1, int y1) {

        // Update model with y
        int result=0;
        for (int i=0; i<cn; ++i) {
            if (cp[i]) {
                assert(cp[i]>=&t[0].bh[0][0] && cp[i]<=&t[t.size()-1].bh[6][6]);
                assert((long(cp[i])&63)>=15);
                int ns=nex(*cp[i], y1);  // next bit-history state after bit y1
                if (ns>=204 && rnd() << ((452-ns)>>3)) ns-=4;  // probabilistic increment
                *cp[i]=ns;
            }

            // Update context pointers
            if (g_bpos>1 && runp[i][0]==0)
            {
                cp[i]=0;  // no stored history; prediction below uses sm[i].p(0)
            }
            else
            {
                // Per the comment on table t, a byte takes three hash lookups
                // (bpos 0, 2 and 5); other bit positions index within the row.
                switch(g_bpos)
                {
                    case 1: case 3: case 6: cp[i]=cp0[i]+1+(cc&1); break;
                    case 4: case 7: cp[i]=cp0[i]+3+(cc&3); break;
                    case 2: case 5: cp0[i]=cp[i]=t[(cxt[i]+cc)&(t.size()-1)].get(cxt[i]>>16); break;
                    default:
                    {
                        cp0[i]=cp[i]=t[(cxt[i]+cc)&(t.size()-1)].get(cxt[i]>>16);
                        // Update pending bit histories for bits 2-7
                        // (deferred work flagged by cp0[i][3]==2, with the byte
                        // value saved in cp0[i][4]).
                        if (cp0[i][3]==2) {
                            const int c=cp0[i][4]+256;
                            U8 *p=t[(cxt[i]+(c>>6))&(t.size()-1)].get(cxt[i]>>16);
                            p[0]=1+((c>>5)&1);
                            p[1+((c>>5)&1)]=1+((c>>4)&1);
                            p[3+((c>>4)&3)]=1+((c>>3)&1);
                            p=t[(cxt[i]+(c>>3))&(t.size()-1)].get(cxt[i]>>16);
                            p[0]=1+((c>>2)&1);
                            p[1+((c>>2)&1)]=1+((c>>1)&1);
                            p[3+((c>>1)&3)]=1+(c&1);
                            cp0[i][6]=0;
                        }
                        // Update run count of previous context
                        if (runp[i][0]==0)  // new context
                            runp[i][0]=2, runp[i][1]=c1;
                        else if (runp[i][1]!=c1)  // different byte in context
                            runp[i][0]=1, runp[i][1]=c1;
                        else if (runp[i][0]<254)  // same byte in context
                            runp[i][0]+=2;
                        runp[i]=cp0[i]+3;  // run data lives at offset 3 of the new row
                    } break;
                }
            }

            // predict from last byte in context
            // (only when the run byte agrees with the cc bits seen so far)
            if ((runp[i][1]+256)>>(8-bp)==cc) {
                int rc=runp[i][0];  // count*2, +1 if 2 different bytes seen
                int b=(runp[i][1]>>(7-bp)&1)*2-1;  // predicted bit + for 1, - for 0
                int c=ilog(rc+1)<<(2+(~rc&1));  // confidence grows with run length
                m.add(b*c);
            }
            else
                m.add(0);


            // predict from bit context
            int p;;
            if (cp[i])
            {
                result+=(*cp[i]>0);
                p = sm[i].p(*cp[i]);
            }
            else
            {
                p = sm[i].p(0);
            }
            m.add(stretch(p));


        }
        if (bp==7) cn=0;  // byte complete; contexts must be set() again
        return result;
    }

  2. #2
    Expert
    Matt Mahoney's Avatar
    Join Date
    May 2008
    Location
    Melbourne, Florida, USA
    Posts
    3,255
    Thanks
    306
    Thanked 778 Times in 485 Posts
    I don't think it should. What are your test results?

  3. #3
    Member
    Join Date
    Nov 2014
    Location
    China
    Posts
    11
    Thanks
    3
    Thanked 0 Times in 0 Posts
    Quote Originally Posted by Matt Mahoney View Post
    I don't think it should. What are your test results?
    I have tested a couple of data and the results show fine.

    But I still worry about some worst cases.

    By the way, as "runp[i]=cp0[i]+3" indicates, cp0[3..4] is the run count of the previous context, but per "bh[][3..6] = 3rd bit", cp0[3..6] is also the bit history; when such a conflict happens, a different execution sequence simply produces a different result.

    For example just assume that for some i,j in [0,cn), i != j and cp0[i][3] and runp[j][0] refer to the same item, what is the expected result?
    Also, t[(cxt[i]+cc)&(t.size()-1)].get(cxt[i]>>16) may return the same array as for a previous "i", in which case the result also varies.

  4. #4
    Expert
    Matt Mahoney's Avatar
    Join Date
    May 2008
    Location
    Melbourne, Florida, USA
    Posts
    3,255
    Thanks
    306
    Thanked 778 Times in 485 Posts
    It's been a long time since I looked at the code. But I recall updating the context hash every 3 bits which allowed for 7 counters and a 1 byte hash checksum. After the last bit there are 4 bytes left over for the run count. It worked nicely because it fit 8 rows into a 64 byte cache line to allow up to 8 hash searches. In zpaq I changed to updating every 4 bits (15 counters and a checksum) to get a little better speed but at a cost of wasted memory. Also, there isn't much gain after 3 hash searches because you lose more compression on missed collisions (probability 1/256) than you gain on saving space in the hash table. The run counters don't help much.

    Also, I seem to remember bp being a local copy of g_bpos to get better speed.

  5. The Following User Says Thank You to Matt Mahoney For This Useful Post:

    davidwjx (6th December 2014)

  6. #5
    Member
    Join Date
    Nov 2014
    Location
    China
    Posts
    11
    Thanks
    3
    Thanked 0 Times in 0 Posts
    Thank you for your remarkable work.

    Now I'd like to redesign the CM for some different computing environment, can you please point me to the right direction.

    I have dug into the paq7 series, finding that "ContextMap = RunContextMap + NonstationaryContextMap". But it uses one CM for each context; for contextModel2, there are "cm2(MEM), cm3(MEM), cm4(MEM),cm5(MEM*2), cm6(MEM*2), cm7(MEM*2), cm8(MEM*2)".

  7. #6
    Expert
    Matt Mahoney's Avatar
    Join Date
    May 2008
    Location
    Melbourne, Florida, USA
    Posts
    3,255
    Thanks
    306
    Thanked 778 Times in 485 Posts
    It should all be portable C++ except for the assembler code for the mixer, and there is an option to use the equivalent C++ code that is a little slower.

  8. #7
    Member
    Join Date
    Nov 2014
    Location
    China
    Posts
    11
    Thanks
    3
    Thanked 0 Times in 0 Posts
    Quote Originally Posted by Matt Mahoney View Post
    It should all be portable C++ except for the assembler code for the mixer, and there is an option to use the equivalent C++ code that is a little slower.
    Yes, sure.
    But I am trying to port it to parallel-execution environment, like cluster-computers.

    I have tried to split CM so that each instance handles one cx, and combined many CM instances to get the right result, as follows:
    Code:
    // Single-context variant of paq8's ContextMap: manages exactly one whole-byte
    // context hash (supplied via set()) instead of an array of C contexts.
    class ContextMap_Single {
      class E {  // hash element, 64 bytes
        U16 chk[7];  // byte context checksums
        U8 last;     // last 2 accesses (0-6) in low, high nibble
      public:
        U8 bh[7][7]; // byte context, 3-bit context -> bit history state
          // bh[][0] = 1st bit, bh[][1,2] = 2nd bit, bh[][3..6] = 3rd bit
          // bh[][0] is also a replacement priority, 0 = empty
        U8* get(U16 chk);  // Find element (0-6) matching checksum.
          // If not found, insert or replace lowest priority (not last).
      };
      Array<E, 64> t;  // bit histories for bits 0-1, 2-4, 5-7
        // For 0-1, also contains a run count in bh[][4] and value in bh[][5]
        // and pending update count in bh[7]
      U8* cp;   // pointer to current bit history (single context, not C of them)
      U8* cp0;  // first element of the 7 element array containing cp
      U32 cxt;  // the single whole byte context (hash) for this instance
      U8* runp; // [0..1] = run count, value (aliases the row at cp0+3)
      StateMap *sm;    // owned map of state -> p (deleted in the destructor)
      void update(U32 cx, int c);  // train model that context cx predicts c
    public:
      int mix1(Mixer& m, int cc, int bp, int c1, int y1,FILE*fp);
        // mix() with global context passed as arguments to improve speed.
    public:
      ContextMap_Single(int m);  // m = memory in bytes, a power of 2
      ~ContextMap_Single();
      void set(U32 cx);   // set the whole byte context to cx
      Random* rnd;  // owned per-instance RNG (deleted in the destructor)
    };
    
    // Locate the hash slot (0-6) whose checksum equals ch; on a miss, recycle the
    // lowest-priority slot that was not one of the last two accessed, zero its
    // bit-history row and stamp it with ch.  `last` packs the two most recently
    // used slot indices into its low and high nibbles.
    inline U8* ContextMap_Single::E::get(U16 ch) {
      // Fast path: the most recently used slot still matches.
      if (chk[last&15]==ch) return &bh[last&15][0];
      int worst=0xffff;  // lowest replacement priority seen so far
      int victim=0;      // slot to recycle if no checksum matches
      for (int j=0; j<7; ++j) {
        if (chk[j]==ch) {
          last=last<<4|j;  // record this slot as the most recent access
          return (U8*)&bh[j][0];
        }
        const int pri=bh[j][0];  // bh[][0] doubles as replacement priority
        if (pri<worst && (last&15)!=j && last>>4!=j) {
          worst=pri;
          victim=j;
        }
      }
      // Miss: claim the victim slot for checksum ch with an empty history.
      last=0xf0|victim;
      chk[victim]=ch;
      return (U8*)memset(&bh[victim][0], 0, 7);
    }
    
    // Construct a single-context map using m bytes of memory (m must be a power
    // of 2, at least 64).  Allocates the owned StateMap and Random, and parks the
    // history pointers on the first table row until set() provides a context.
    ContextMap_Single::ContextMap_Single(int m): t(m>>6), cp(NULL), cp0(NULL),
        cxt(0), runp(NULL) {
      assert(m>=64 && (m&m-1)==0);  // power of 2?
      assert(sizeof(E)==64);        // one hash element per 64-byte cache line
      sm = new StateMap;
      rnd = new Random;
      cp = cp0 = &t[0].bh[0][0];
      runp = cp + 3;
    }
    
    // Destructor: releases the StateMap and Random allocated in the constructor.
    // The hash table t is an Array member and releases itself.
    ContextMap_Single::~ContextMap_Single() {
      delete sm;
      delete rnd;
    }
    
    // Set the (single) whole byte context hash to cx; it is consumed by the
    // hash lookups in mix1().
    inline void ContextMap_Single::set(U32 cx) {
      cxt=cx;
    }
    
    // Update the model with bit y1, and predict next bit to mixer m.
    // Single-context version of ContextMap::mix1.
    // Context: cc=c0 (partial byte), bp=bpos, c1=buf(1) (previous byte), y1=y.
    // fp: optional debug trace; when non-NULL, the context hash and the two
    // mixer inputs are appended as "cxt-run,stretched;".
    // Returns 1 if this context currently has a nonzero bit-history state,
    // else 0 (the caller sums these over all active contexts).
    int ContextMap_Single::mix1(Mixer& m, int cc, int bp, int c1, int y1,FILE*fp) {
      int madd = 0;  // run-model mixer input, captured only for the fp trace
      // Update model with y
      int result=0;
      {
        int rndValue = rnd[0]();
        if (cp) {
          assert(cp>=&t[0].bh[0][0] && cp<=&t[t.size()-1].bh[6][6]);
          assert((long(cp)&63)>=15);
          int ns=nex(*cp, y1);  // next bit-history state after observing y1
          if (ns>=204 && rndValue << ((452-ns)>>3)) ns-=4;  // probabilistic increment
          *cp=ns;
        }

        // Update context pointers
        if (g_bpos>1 && runp[0]==0)
        {
         cp=0;  // no usable history; prediction below falls back to sm->p(0)
        }
        else
        {
         // Three hash lookups per byte (bpos 0, 2 and 5); other bit positions
         // index within the row fetched at the most recent lookup.
         switch(g_bpos)
         {
          case 1: case 3: case 6: cp=cp0+1+(cc&1); break;
          case 4: case 7: cp=cp0+3+(cc&3); break;
          case 2: case 5: cp0=cp=t[(cxt+cc)&(t.size()-1)].get(cxt>>16); break;
          default:
          {
           cp0=cp=t[(cxt+cc)&(t.size()-1)].get(cxt>>16);
           // Update pending bit histories for bits 2-7
           // (deferred work flagged by cp0[3]==2, byte value saved in cp0[4])
           if (cp0[3]==2) {
             const int c=cp0[4]+256;
             U8 *p=t[(cxt+(c>>6))&(t.size()-1)].get(cxt>>16);
             p[0]=1+((c>>5)&1);
             p[1+((c>>5)&1)]=1+((c>>4)&1);
             p[3+((c>>4)&3)]=1+((c>>3)&1);
             p=t[(cxt+(c>>3))&(t.size()-1)].get(cxt>>16);
             p[0]=1+((c>>2)&1);
             p[1+((c>>2)&1)]=1+((c>>1)&1);
             p[3+((c>>1)&3)]=1+(c&1);
             cp0[6]=0;
           }
           // Update run count of previous context
           if (runp[0]==0)  // new context
             runp[0]=2, runp[1]=c1;
           else if (runp[1]!=c1)  // different byte in context
             runp[0]=1, runp[1]=c1;
           else if (runp[0]<254)  // same byte in context
             runp[0]+=2;
           runp=cp0+3;  // run data lives at offset 3 of the new row
          } break;
         }
        }

       // predict from last byte in context
       if ((runp[1]+256)>>(8-bp)==cc) {  // run byte agrees with bits seen so far
         int rc=runp[0];  // count*2, +1 if 2 different bytes seen
         int b=(runp[1]>>(7-bp)&1)*2-1;  // predicted bit + for 1, - for 0
         int c=ilog(rc+1)<<(2+(~rc&1));
         m.add(b*c);
         madd = b*c;
       }
       else
       {
         m.add(0);
         madd = 0;
       }


        // predict from bit context
       int p;
       if (cp)
       {
        result+=(*cp>0);
        p = sm->p(*cp);
       }
       else
       {
        p = sm->p(0);
       }
       m.add(stretch(p));

       if(fp)
       {
          // BUG FIX: cxt is U32, so use %u -- the old %d printed large hashes
          // as negative numbers and is undefined for values above INT_MAX.
          fprintf(fp,"%u-%d,%d;",cxt,madd,stretch(p));
       }
      }
      return result;
    }
    
    // Drop-in multi-context wrapper: owns C independent ContextMap_Single
    // instances (one per context slot) and fans set()/mix1() out to them.
    class ContextMap_Mul {
      const int C;  // max number of contexts
      int cn;          // Next context to set by set()
      Array<U32> cxt;  // C whole byte contexts (hashes)
      ContextMap_Single** cm_single;  // C owned single-context models
      void update(U32 cx, int c);  // train model that context cx predicts c
      int mix1(Mixer& m, int cc, int bp, int c1, int y1,FILE*fp);
        // mix() with global context passed as arguments to improve speed.
    public:
      ContextMap_Mul(int m, int c=1);  // m = memory in bytes PER context (power of 2), C = c
      ~ContextMap_Mul();
      void set(U32 cx, int next=-1);   // set next whole byte context to cx
        // next=-1 uses the next free slot; otherwise the slot index is masked with next
      int mix(Mixer& m,FILE*fp=NULL) {return mix1(m, g_c0, g_bpos, g_buf(1), g_y,fp);}
    };
    
    // Construct using m bytes of memory for EACH of the c contexts (total
    // memory is roughly c*m, unlike the original ContextMap which shares m
    // across all C contexts).
    // FIX: the member-initializer list is reordered to match the declaration
    // order (C, cn, cxt, cm_single) -- members are always initialized in
    // declaration order regardless of the list, so the old order was
    // misleading and triggered -Wreorder.
    ContextMap_Mul::ContextMap_Mul(int m, int c): C(c), cn(0), cxt(c),
        cm_single(new ContextMap_Single*[c]) {
      assert(m>=64 && (m&m-1)==0);  // m must be a power of 2
      for(int i = 0 ; i < C; i ++)
      {
          cm_single[i] = new ContextMap_Single(m);
      }
    }
    
    // Destructor: delete each owned ContextMap_Single, then the pointer array.
    ContextMap_Mul::~ContextMap_Mul() {

      for(int i = 0 ; i < C; i ++)
      {
          delete cm_single[i];
      }
      delete[] cm_single;
    }
    
    // Set the next whole byte context to cx.  With the default next=-1 the
    // slot index is the next free one (i&=-1 is a no-op); any other value
    // masks the slot index -- e.g. next=0 forces slot 0.
    inline void ContextMap_Mul::set(U32 cx, int next) {
      int i=cn++;
      i&=next;  // no-op for next=-1 (all bits set)
      assert(i>=0 && i<C);
      cx=cx*987654323+i;  // permute (don't hash) cx to spread the distribution
      cx=cx<<16|cx>>16;   // swap halves so the upper (checksum) bits vary too
      cxt[i]=cx*123456791+i;
      cm_single[i]->set(cxt[i]);
    }
    
    // Update the model with bit y1, and predict the next bit to mixer m by
    // delegating to every active single-context model and summing their results.
    // Context: cc=c0, bp=bpos, c1=buf(1), y1=y.  fp, when non-NULL, receives a
    // per-bit debug trace line.
    int ContextMap_Mul::mix1(Mixer& m, int cc, int bp, int c1, int y1,FILE*fp) {
      if (fp) fprintf(fp,"%d,%d,%d,%d\t",cc,bp,c1,y1);
      int result = 0;
      for (int idx = 0; idx < cn; ++idx)
        result += cm_single[idx]->mix1(m, cc, bp, c1, y1, fp);
      if (fp) fprintf(fp,"\n");
      if (bp==7) cn=0;  // byte boundary: contexts must be set() again
      return result;
    }
    The compression ratio in testing looks fine as well. But it uses too much memory, and doesn't speed up.

    The original CM uses 500 ns on average, but the ContextMap_Single uses 200 ns. And I think it is because of the cache.

    And my goal is
    1.try to speed up
    2.use less memory as compared to ContextMap_Mul.

  9. #8
    Expert
    Matt Mahoney's Avatar
    Join Date
    May 2008
    Location
    Melbourne, Florida, USA
    Posts
    3,255
    Thanks
    306
    Thanked 778 Times in 485 Posts
    PAQ isn't really well suited for multi-threading. You can run all the context models in parallel but you have to serialize to mix them, run the SSE (APM) stages and arithmetic coding. This happens 8 times per byte of data. You could pipeline it by delaying updates to the model, but then you lose some compression. You will also lose compression because you have to divide up the memory between threads, and lose speed because you have to share the cache too. You can't parallelize shared random memory access, and that is already the bottleneck.

    It is probably easier to divide up the data into blocks and compress them independently in parallel. That is what I do in ZPAQ. Of course you still lose some compression and use more memory. For really big data sets it makes sense to look for mutual information on disk instead of trying to keep it in memory. So I dedupe and group the remaining files by type. Also, don't waste time compressing already compressed data, which is what makes up a large part of most file systems.

  10. The Following User Says Thank You to Matt Mahoney For This Useful Post:

    davidwjx (7th December 2014)

  11. #9
    Member
    Join Date
    Nov 2014
    Location
    China
    Posts
    11
    Thanks
    3
    Thanked 0 Times in 0 Posts
    Quote Originally Posted by Matt Mahoney View Post
    You can run all the context models in parallel but you have to serialize to mix them, run the SSE (APM) stages and arithmetic coding. This happens 8 times per byte of data.
    As I have profiled paq8 with gprof, the Mixer and APM are not really the bottleneck. I can use a DSP to speed them up.

    The bottleneck is the CM, and the random-memory-access throughput dominates this problem.

    If I can assign a DDR3 controller to each contextmap_single, it will speed things up a lot. But it needs too much memory.
    As ContextMap_Single in #7, it uses about 25 cycles for each byte, while the original Contextmap uses about 164 cycles for each bytes.

    I try to find some tradeoff between memory and speed.
    Last edited by davidwjx; 8th December 2014 at 03:41.

  12. #10
    Member
    Join Date
    Nov 2014
    Location
    China
    Posts
    11
    Thanks
    3
    Thanked 0 Times in 0 Posts
    Code:
    // Quoted from mix1: the deferred ("pending") update of bit histories for
    // bits 2-7 of a PREVIOUS byte, executed at the next bpos==0 hash lookup.
    // NOTE(review): per the run-count code, cp0[3]==2 with the byte saved in
    // cp0[4] is the state written when the context first saw that byte; the
    // two get() calls fetch the rows for that byte's bit-2..4 and bit-5..7
    // sub-contexts and write first/second/third-bit states into each.
    if (cp0[3]==2) {
             const int c=cp0[4]+256;  // saved byte plus a leading marker bit
             U8 *p=t[(cxt+(c>>6))&(t.size()-1)].get(cxt>>16);
             p[0]=1+((c>>5)&1);
             p[1+((c>>5)&1)]=1+((c>>4)&1);
             p[3+((c>>4)&3)]=1+((c>>3)&1);
             p=t[(cxt+(c>>3))&(t.size()-1)].get(cxt>>16);
             p[0]=1+((c>>2)&1);
             p[1+((c>>2)&1)]=1+((c>>1)&1);
             p[3+((c>>1)&3)]=1+(c&1);
             cp0[6]=0;  // clear the pending marker
           }
    By the way, what does this code mean? There are two t[...].get(cxt>>16) calls, each of which accesses about 512 bits of main memory.

    Is there any way to optimize it out?

  13. #11
    Member
    Join Date
    Nov 2014
    Location
    China
    Posts
    11
    Thanks
    3
    Thanked 0 Times in 0 Posts
    Suppose the cache is big enough; is there any way to minimize main-memory access, such as accessing main memory only once per byte?

  14. #12
    Expert
    Matt Mahoney's Avatar
    Join Date
    May 2008
    Location
    Melbourne, Florida, USA
    Posts
    3,255
    Thanks
    306
    Thanked 778 Times in 485 Posts
    Not sure but I think both memory accesses are in the same 64 byte cache line.

Similar Threads

  1. FP8 (= Fast PAQ8)
    By Jan Ondrus in forum Data Compression
    Replies: 65
    Last Post: 1st April 2019, 10:05
  2. PAQ8 - Download Page
    By Jan Ondrus in forum Data Compression
    Replies: 7
    Last Post: 7th October 2010, 21:14
  3. deflate model for paq8?
    By kaitz in forum Data Compression
    Replies: 2
    Last Post: 6th February 2009, 21:48
  4. PAQ8 tests
    By kaitz in forum Forum Archive
    Replies: 4
    Last Post: 17th January 2008, 15:03
  5. PeaZip v1.3 now with PAQ8 support!
    By LovePimple in forum Forum Archive
    Replies: 29
    Last Post: 9th February 2007, 16:58

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •