"""
    chacha.py
    
    An implementation of ChaCha in about 130 operative lines 
    of 100% pure Python code.
    
    Copyright (c) 2009-2011 by Larry Bugbee, Kent, WA
    ALL RIGHTS RESERVED.

    chacha.py IS EXPERIMENTAL SOFTWARE FOR EDUCATIONAL
    PURPOSES ONLY.  IT IS MADE AVAILABLE "AS-IS" WITHOUT 
    WARRANTY OR GUARANTEE OF ANY KIND.  USE SIGNIFIES 
    ACCEPTANCE OF ALL RISK.  

    To make your learning and experimentation less cumbersome, 
    chacha.py is free for any use.      
    
    This implementation is intended for Python 2.x.
    
    Larry Bugbee
    May 2009     (Salsa20)
    August 2009  (ChaCha)
    rev June 2010
    rev March 2011  - tweaked _quarterround() to get 20-30% speed gain
"""

import struct
try:
    import psyco            # a specializing [runtime] compiler
    have_psyco = True       # for 32-bit architectures
    print 'psyco enabled'
except:
    have_psyco = False
    
#-----------------------------------------------------------------------

class ChaCha(object):
    """
        ChaCha is an improved variant of Salsa20.
        
        Salsa20 was submitted to eSTREAM, an EU stream cipher
        competition.  Salsa20 was originally defined to be 20
        rounds.  Reduced round versions, Salsa20/8 (8 rounds) and
        Salsa20/12 (12 rounds), were later submitted.  Salsa20/12
        was chosen as one of the winners and 12 rounds was deemed
        the "right blend" of security and efficiency.  Salsa20 
        is about 3x-4x faster than AES-128.
        
        Both ChaCha and Salsa20 accept a 128-bit or a 256-bit key 
        and a 64-bit IV to set up an initial 64-byte state.  For 
        each 64-bytes of data, the state gets scrambled and XORed 
        with the previous state.  This new state is then XORed 
        with the input data to produce the output.  Both being 
        stream ciphers, their encryption and decryption functions 
        are identical.  
        
        While Salsa20's diffusion properties are very good, some 
        claimed the IV/keystream correlation was too strong for 
        comfort.  To satisfy, another variant called XSalsa20 
        implements a 128-bit IV.  For the record, EU eSTREAM team 
        did select Salsa20/12 as a solid cipher providing 128-bit 
        security.  
        
        ChaCha is a minor tweak of Salsa20 that significantly 
        improves its diffusion per round.  ChaCha is more secure 
        than Salsa20 and 8 rounds of ChaCha, aka ChaCha8, provides 
        128-bit security.  (FWIW, I have not seen any calls for a 
        128-bit IV version of ChaCha or XChaCha.)  
        
        Another benefit is that ChaCha8 is about 5-8% faster than 
        Salsa20/8 on most 32- and 64-bit PPC and Intel processors.  
        SIMD machines should see even more improvement.  
        
        Sample usage:
          from chacha import ChaCha
          
          cc8 = ChaCha(key, iv)
          ciphertext = cc8.encrypt(plaintext)
          
          cc8 = ChaCha(key, iv)
          plaintext = cc8.decrypt(ciphertext)
        
        Remember, the purpose of this program is educational; it 
        is NOT a secure implementation nor is a pure Python version 
        going to be fast.  Encrypting large data will be less than 
        satisfying.  Also, no effort is made to protect the key or 
        wipe critical memory after use.  
        
        Note that psyco, a specializing compiler somewhat akin to 
        a JIT, can provide a 2x+ performance improvement over 
        vanilla Python 32-bit architectures.  A 64-bit version of 
        psyco does not exist.  See http://psyco.sourceforge.net
        
        For more information about Daniel Bernstein's ChaCha 
        algorithm, please see http://cr.yp.to/chacha.html
        
        All we need now is a keystream AND authentication in the 
        same pass.  
        
        Larry Bugbee
        May 2009     (Salsa20)
        August 2009  (ChaCha)
        rev June 2010
    """

    TAU    = ( 0x61707865, 0x3120646e, 0x79622d36, 0x6b206574 )
    SIGMA  = ( 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 )
    ROUNDS = 8                         # ...10, 12, 20?

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def __init__(self, key, iv, rounds=ROUNDS):
        """ Both key and iv are byte strings.  The key must be 
            exactly 16 or 32 bytes, 128 or 256 bits respectively.  
            The iv must be exactly 8 bytes (64 bits) and MUST 
            never be reused with the same key.
            
            The default number of rounds is 8.

            If you have several encryptions/decryptions that use 
            the same key, you may reuse the same instance and 
            simply call iv_setup() to set the new iv.  The previous 
            key and the new iv will establish a new state.
        """
        self._key_setup(key)
        self.iv_setup(iv)
        self.rounds = rounds

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def _key_setup(self, key):
        """ key is converted to a list of 4-byte unsigned integers
            (32 bits).

            Calling this routine with a key value effectively resets
            the context/instance.  Be sure to set the iv as well.
        """
        if len(key) not in [16, 32]:
            raise Exception('key must be either 16 or 32 bytes')
        key_state = [0]*16
        if len(key) == 16:
            k = list(struct.unpack('<4I', key))
            key_state[0]  = self.TAU[0]
            key_state[1]  = self.TAU[1]
            key_state[2]  = self.TAU[2]
            key_state[3]  = self.TAU[3]
            key_state[4]  = k[0]
            key_state[5]  = k[1]
            key_state[6]  = k[2]
            key_state[7]  = k[3]
            key_state[8]  = k[0]
            key_state[9]  = k[1]
            key_state[10] = k[2]
            key_state[11] = k[3]
            # 12 and 13 are reserved for the counter
            # 14 and 15 are reserved for the IV

        elif len(key) == 32:
            k = list(struct.unpack('<8I', key))
            key_state[0]  = self.SIGMA[0]
            key_state[1]  = self.SIGMA[1]
            key_state[2]  = self.SIGMA[2]
            key_state[3]  = self.SIGMA[3]
            key_state[4]  = k[0]
            key_state[5]  = k[1]
            key_state[6]  = k[2]
            key_state[7]  = k[3]
            key_state[8]  = k[4]
            key_state[9]  = k[5]
            key_state[10] = k[6]
            key_state[11] = k[7]
            # 12 and 13 are reserved for the counter
            # 14 and 15 are reserved for the IV
        self.key_state = key_state

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def iv_setup(self, iv):
        """ self.state and other working structures are lists of
            4-byte unsigned integers (32 bits).

            The iv is not a secret but it should never be reused 
            with the same key value.  Use date, time or some other
            counter to be sure the iv is different each time, and
            be sure to communicate the IV to the receiving party.
            Prepending 8 bytes of iv to the ciphertext is the usual
            way to do this.

            Just as setting a new key value effectively resets the
            context, setting the iv also resets the context with
            the last key value entered.
        """
        if len(iv) != 8:
            raise Exception('iv must be 8 bytes')
        v = list(struct.unpack('<2I', iv))
        iv_state = self.key_state[:]
        iv_state[12] = 0
        iv_state[13] = 0
        iv_state[14] = v[0]
        iv_state[15] = v[1]
        self.state = iv_state
        self.lastblock_sz = 64      # init flag - unsafe to continue
                                    # processing if not 64

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def encrypt(self, datain):
        """ Encrypt a chunk of data.  datain and the returned value 
            are byte strings.

            If large data is submitted to this routine in chunks,
            the chunk size MUST be an exact multiple of 64 bytes.
            Only the final chunk may be less than an even multiple.
            (This function does not "save" any uneven, left-over 
            data for concatenation to the front of the next chunk.)
            
            The amount of available memory imposes a poorly defined
            limit on the amount of data this routine can process.
            Typically 10's and 100's of KB are available but then,
            perhaps not.  This routine is intended for educational 
            purposes so application developers should take heed.
        """
        if self.lastblock_sz != 64:
            raise Exception('last chunk size not a multiple of 64 bytes')
        dataout = []
        while datain:
            # generate 64 bytes of cipher stream
            stream = self._chacha_scramble();
            # XOR the stream onto the next 64 bytes of data
            dataout.append(self._xor(stream, datain))
            if len(datain) <= 64:
                self.lastblock_sz = len(datain)
                return ''.join(dataout)
            # increment the iv.  In this case we increment words
            # 12 and 13 in little endian order.  This will work 
            # nicely for data up to 2^70 bytes (1,099,511,627,776GB) 
            # in length.  After that it is the user's responsibility 
            # to generate a new nonce/iv.
            self.state[12] = (self.state[12] + 1) & 0xffffffff
            if self.state[12] == 0:           # if overflow in state[12]
                self.state[13] += 1           # carry to state[13]
                # not to exceed 2^70 x 2^64 = 2^134 data size ??? <<<<
            # get ready for the next iteration
            datain = datain[64:]
        # should never get here
        raise Exception('Huh?')
    
    decrypt = encrypt
    
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    
    def _chacha_scramble(self):     # 64 bytes in
        """ self.state and other working strucures are lists of
            4-byte unsigned integers (32 bits).

            output must be converted to bytestring before return.
        """
        x = self.state[:]           # makes a copy
        for i in xrange(0, self.rounds, 2):
            # two rounds per iteration
            self._quarterround(x, 0, 4, 8,12)
            self._quarterround(x, 1, 5, 9,13)
            self._quarterround(x, 2, 6,10,14)
            self._quarterround(x, 3, 7,11,15)
            
            self._quarterround(x, 0, 5,10,15)
            self._quarterround(x, 1, 6,11,12)
            self._quarterround(x, 2, 7, 8,13)
            self._quarterround(x, 3, 4, 9,14)
            
        for i in xrange(16):
            x[i] = (x[i] + self.state[i]) & 0xffffffff
        output = struct.pack('<16I',
                            x[ 0], x[ 1], x[ 2], x[ 3],
                            x[ 4], x[ 5], x[ 6], x[ 7],
                            x[ 8], x[ 9], x[10], x[11],
                            x[12], x[13], x[14], x[15])
        return output               # 64 bytes out
    
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    
    '''
    # as per definition - deprecated
    def _quarterround(self, x, a, b, c, d):
        x[a] = (x[a] + x[b]) & 0xFFFFFFFF
        x[d] = self._rotate((x[d]^x[a]), 16)
        x[c] = (x[c] + x[d]) & 0xFFFFFFFF
        x[b] = self._rotate((x[b]^x[c]), 12)
        
        x[a] = (x[a] + x[b]) & 0xFFFFFFFF
        x[d] = self._rotate((x[d]^x[a]), 8)
        x[c] = (x[c] + x[d]) & 0xFFFFFFFF
        x[b] = self._rotate((x[b]^x[c]), 7)
        
    def _rotate(self, v, n):        # aka ROTL32
        return ((v << n) & 0xFFFFFFFF) | (v >> (32 - n))
    '''
    
    # surprisingly, the following tweaks/accelerations provide 
    # about a 20-40% gain
    def _quarterround(self, x, a, b, c, d):
        xa = x[a]
        xb = x[b]
        xc = x[c]
        xd = x[d]
        
        xa  = (xa + xb)  & 0xFFFFFFFF
        tmp =  xd ^ xa
        xd  = ((tmp << 16) & 0xFFFFFFFF) | (tmp >> 16)  # 16=32-16
        xc  = (xc + xd)  & 0xFFFFFFFF
        tmp =  xb ^ xc
        xb  = ((tmp << 12) & 0xFFFFFFFF) | (tmp >> 20)  # 20=32-12
        
        xa  = (xa + xb)  & 0xFFFFFFFF
        tmp =  xd ^ xa
        xd  = ((tmp <<  8) & 0xFFFFFFFF) | (tmp >> 24)  # 24=32-8
        xc  = (xc + xd)  & 0xFFFFFFFF
        tmp =  xb ^ xc
        xb  = ((tmp <<  7) & 0xFFFFFFFF) | (tmp >> 25)  # 25=32-7
        
        x[a] = xa
        x[b] = xb
        x[c] = xc
        x[d] = xd
    
    
    def _xor(self, stream, datain):
        dataout = []
        for i in xrange(min(len(stream), len(datain))):
            dataout.append(chr(ord(stream[i])^ord(datain[i])))
        return ''.join(dataout)
    
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    
    if have_psyco:
        # if you psyco encrypt() and _chacha_scramble() you
        # should get a 2.4x speedup over vanilla Python 2.5.  
        # The other functions seem to offer only negligible 
        # improvement.  YMMV.
        
        _key_setup = psyco.proxy(_key_setup)    # small impact
        iv_setup   = psyco.proxy(iv_setup)      # small impact
        encrypt    = psyco.proxy(encrypt)                   # 18-32%
        _chacha_scramble = psyco.proxy(_chacha_scramble)    # big help, 2x
        _quarterround    = psyco.proxy(_quarterround)       # ???
#        _rotate = psyco.proxy(_rotate)          # minor impact
        _xor    = psyco.proxy(_xor)             # very small impact
        pass

#-----------------------------------------------------------------------
#-----------------------------------------------------------------------
#-----------------------------------------------------------------------