/* Arithmetic on lists of residues modulo n.

  Copyright 2001, 2002, 2003, 2004, 2005 Paul Zimmermann and Alexander Kruppa.

  This file is part of the ECM Library.

  The ECM Library is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or (at your
  option) any later version.

  The ECM Library is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with the ECM Library; see the file COPYING.LIB.  If not, write to
  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  MA 02111-1307, USA.
*/

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include "gmp.h"
#include "ecm.h"
#include "ecm-impl.h"

#ifdef DEBUG
#define ASSERTD(x) assert(x)
#else
#define ASSERTD(x)
#endif

#if (MULT == KS)
#define LIST_MULT_N kronecker_schonhage
#ifdef HAVE_FFT
#define WRAP /* use wrap-around multiplication for low short product */
#endif
#elif (MULT == TOOM4)
#define LIST_MULT_N toomcook4
#elif (MULT == TOOM3)
#define LIST_MULT_N toomcook3
#elif (MULT == KARA)
#define LIST_MULT_N karatsuba
#else
#error "MULT is neither KS, TOOM4, nor TOOM3, nor KARA"
#endif

extern unsigned int Fermat;

/* returns a bound on the auxiliary memory needed by LIST_MULT_N */
int
list_mul_mem (unsigned int len)
{
  unsigned int mem;

  mem = 2 * len;
#if defined(TOOMCOOK3) || defined(TOOMCOOK4)
  while (len > 3)
    {
      mem += 2;
      len = (len + 2) / 3; /* ceil(len/3) */
    }
  mem += 4;
#endif
  return mem;
}

/* creates a list of n integers, return NULL if error */
listz_t
init_list (unsigned int n)
{
  listz_t p;
  unsigned int i;

  p = (mpz_t*) malloc (n * sizeof(mpz_t));
  if (p == NULL)
    return NULL;
  for (i = 0; i < n; i++)
    mpz_init (p[i]);
  return p;
}

/* clears a list of n integers */
void
clear_list (listz_t p, unsigned int n)
{
  unsigned int i;

  if (p == NULL)
    return;
  for (i = 0; i < n; i++)
    mpz_clear (p[i]);
  free (p);
}

#ifdef DEBUG
/* prints a list of n coefficients as a polynomial */
void
print_list (listz_t p, unsigned int n)
{
  unsigned int i;

  for (i = 0; i < n; i++)
    {
      if (i > 0 && mpz_cmp_ui (p[i], 0) >= 0)
        fprintf (ECM_STDOUT, "+");
      mpz_out_str (ECM_STDOUT, 10, p[i]);
      fprintf (ECM_STDOUT, "*x^%u", i);
    }
  fprintf (ECM_STDOUT, "\n");
}

static int
list_check (listz_t a, unsigned int l, mpz_t n)
{
  unsigned int i;

  for (i = 0; i < l; i++)
    if (mpz_cmp_ui (a[i], 0) < 0 || mpz_cmp (n, a[i]) <= 0)
      {
        fprintf (ECM_STDOUT, "l=%u i=%u\n", l, i);
        mpz_out_str (ECM_STDOUT, 10, a[i]);
	fprintf (ECM_STDOUT, "\n");
        return 0;
      }
  return 1;
}
#endif /* DEBUG */

/* Read all entries in list from stream. 
   Return 0 on success, ECM_ERROR on error */
int
list_inp_raw (listz_t a, FILE *f, unsigned int n)
{
  unsigned int i;
  
  for (i = 0; i < n; i++)
    if (mpz_inp_raw (a[i], f) == 0)
      return ECM_ERROR;
  
  return 0;
}

/* Write all entries in list to stream. 
   Return 0 on success, ECM_ERROR on error */
int
list_out_raw (FILE *f, listz_t a, unsigned int n)
{
  unsigned int i;
  
  for (i = 0; i < n; i++)
    if (mpz_out_raw (f, a[i]) == 0)
      return ECM_ERROR;
  
  return 0;
}

/* p <- q */
void
list_set (listz_t p, listz_t q, unsigned int n)
{
  unsigned int i;

  for (i = 0; i < n; i++)
    mpz_set (p[i], q[i]);
}

/* p[0] <-> p[n], p[1] <-> p[n-1], ... */
void
list_revert (listz_t p, unsigned int n)
{
  unsigned int i;

  for (i = 0; i < n - i; i++)
    mpz_swap (p[i], p[n - i]);
}

void
list_swap (listz_t p, listz_t q, unsigned int n)
{
  unsigned int i;

  for (i = 0; i < n; i++)
    mpz_swap (p[i], q[i]);
}

/* p <- -q, keeps residues normalized */
static void
list_neg (listz_t p, listz_t q, unsigned int l, mpz_t n)
{
  unsigned int i;

  for (i = 0; i < l; i++)
    {
      if (mpz_sgn (q[i]))
        mpz_sub (p[i], n, q[i]);
      else
        mpz_set_ui (p[i], 0);
    }
}

/* p <- q modulo mod */
void
list_mod (listz_t p, listz_t q, unsigned int n, mpz_t mod)
{
  unsigned int i;

  for (i = 0; i < n; i++)
    mpz_mod (p[i], q[i], mod);
}

/* p <- q + r */
void
list_add (listz_t p, listz_t q, listz_t r, unsigned int l)
{
  unsigned int i;

  for (i = 0; i < l; i++)
    mpz_add (p[i], q[i], r[i]);
}

/* p <- q - r */
void
list_sub (listz_t p, listz_t q, listz_t r, unsigned int l)
{
  unsigned int i;

  for (i = 0; i < l; i++)
    mpz_sub (p[i], q[i], r[i]);
}

/* p[i] <- q[i] * r mod m */
void
list_mul_z (listz_t p, listz_t q, mpz_t r, unsigned int n, mpz_t m)
{
  unsigned int i;

  for (i = 0; i < n; i++)
    {
      mpz_mul (p[i], q[i], r);
      mpz_mod (p[i], p[i], m);
    }
}

/* p <- gcd(n, l[0]*l[1]*...*l[k-1],
   returns non-zero iff p is non trivial */
int
list_gcd (mpz_t p, listz_t l, unsigned int k, mpz_t n)
{
  unsigned int i;
  
  for (i=1; i<k; i++)
    {
      mpz_mul (l[0], l[0], l[i]);
      mpz_mod (l[0], l[0], n);
    }
  mpz_gcd (p, l[0], n);

  return mpz_cmp_ui (p, 1);
}

/* p <- 0 */
void
list_zero (listz_t p, unsigned int n)
{
  unsigned int i;

  for (i = 0; i < n; i++)
    mpz_set_ui (p[i], 0);
}

/* puts in a[0]..a[K-1] the K low terms of the product 
   of b[0..K-1] and c[0..K-1].
   Assumes K >= 1, and a[0..2K-2] exist.
   Needs space for list_mul_mem(K) in t.
*/
static void
list_mul_low (listz_t a, listz_t b, listz_t c, unsigned int K, listz_t t,
	      mpz_t n)
{
  unsigned int p, q;

  switch (K)
    {
    case 1:
      mpz_mul (a[0], b[0], c[0]);
      return;
    case 2:
      mpz_mul (a[0], b[0], c[0]);
      mpz_mul (a[1], b[0], c[1]);
      mpz_addmul (a[1], b[1], c[0]);
      return;
    case 3:
      karatsuba (a, b, c, 2, t);
      mpz_addmul (a[2], b[2], c[0]);
      mpz_addmul (a[2], b[0], c[2]);
      return;
    default:
      /* MULT is 2 for Karatsuba, 3 for Toom3, 4 for Toom4 */
      for (p = 1; MULT * p <= K; p *= MULT); /* p = greatest power of MULT <=K */
      p = (K / p) * p;
      ASSERTD(list_check(b,p,n) && list_check(c,p,n));
      LIST_MULT_N (a, b, c, p, t);
      if ((q = K - p))
        {
          list_mul_low (t, b + p, c, q, t + 2 * q - 1, n);
          list_add (a + p, a + p, t, q);
          list_mul_low (t, c + p, b, q, t + 2 * q - 1, n);
          list_add (a + p, a + p, t, q);
        }
    }
}

/* puts in a[K-1]..a[2K-2] the K high terms of the product 
   of b[0..K-1] and c[0..K-1].
   Assumes K >= 1, and a[0..2K-2] exist.
   Needs space for list_mul_mem(K) in t.
*/
void
list_mul_high (listz_t a, listz_t b, listz_t c, unsigned int K, listz_t t)
{
#ifdef KS_MULTIPLY /* ks is faster */
  LIST_MULT_N (a, b, c, K, t);
#else
  unsigned int p, q;

  switch (K)
    {
    case 1:
      mpz_mul (a[0], b[0], c[0]);
      return;
      
    case 2:
      mpz_mul (a[2], b[1], c[1]);
      mpz_mul (a[1], b[1], c[0]);
      mpz_addmul (a[1], b[0], c[1]);
      return;

    case 3:
      karatsuba (a + 2, b + 1, c + 1, 2, t);
      mpz_addmul (a[2], b[0], c[2]);
      mpz_addmul (a[2], b[2], c[0]);
      return;

    default:
      /* MULT is 2 for Karatsuba, 3 for Toom3, 4 for Toom4 */
      for (p = 1; MULT * p <= K; p *= MULT);
      p = (K / p) * p;
      q = K - p;
      LIST_MULT_N (a + 2 * q, b + q, c + q, p, t);
      if (q)
        {
          list_mul_high (t, b + p, c, q, t + 2 * q - 1);
          list_add (a + K - 1, a + K - 1, t + q - 1, q);
          list_mul_high (t, c + p, b, q, t + 2 * q - 1);
          list_add (a + K - 1, a + K - 1, t + q - 1, q);
        }
    }
#endif
}

/* Puts in a[0..2K-2] the product of b[0..K-1] and c[0..K-1].
   The auxiliary memory M(K) necessary in T satisfies:
   M(1)=0, M(K) = max(3*l-1,2*l-2+M(l)) <= 2*K-1 where l = ceil(K/2).
   Assumes K >= 1.
*/
void
karatsuba (listz_t a, listz_t b, listz_t c, unsigned int K, listz_t t)
{
  if (K == 1)
    {
      mpz_mul (a[0], b[0], c[0]);
    }
  else if (K == 2) /* basic Karatsuba scheme */
    {
      mpz_add (t[0], b[0], b[1]); /* t0 = b_0 + b_1 */
      mpz_add (a[1], c[0], c[1]); /* a1 = c_0 + c_1 */
      mpz_mul (a[1], a[1], t[0]); /* a1 = b_0*c_0 + b_0*c_1 + b_1*c_0 + b_1*c_1 */
      mpz_mul (a[0], b[0], c[0]); /* a0 = b_0 * c_0 */
      mpz_mul (a[2], b[1], c[1]); /* a2 = b_1 * c_1 */
      mpz_sub (a[1], a[1], a[0]); /* a1 = b_0*c_1 + b_1*c_0 + b_1*c_1 */
      mpz_sub (a[1], a[1], a[2]); /* a1 = b_0*c_1 + b_1*c_0 */
    }
  else if (K == 3)
    {
      /* implement Weimerskirch/Paar trick in 6 muls and 13 adds
         http://www.crypto.ruhr-uni-bochum.de/Publikationen/texte/kaweb.pdf */
      /* diagonal terms */
      mpz_mul (a[0], b[0], c[0]);
      mpz_mul (a[2], b[1], c[1]);
      mpz_mul (a[4], b[2], c[2]);
      /* (0,1) rectangular term */
      mpz_add (t[0], b[0], b[1]);
      mpz_add (t[1], c[0], c[1]);
      mpz_mul (a[1], t[0], t[1]);
      mpz_sub (a[1], a[1], a[0]);
      mpz_sub (a[1], a[1], a[2]);
      /* (1,2) rectangular term */
      mpz_add (t[0], b[1], b[2]);
      mpz_add (t[1], c[1], c[2]);
      mpz_mul (a[3], t[0], t[1]);
      mpz_sub (a[3], a[3], a[2]);
      mpz_sub (a[3], a[3], a[4]);
      /* (0,2) rectangular term */
      mpz_add (t[0], b[0], b[2]);
      mpz_add (t[1], c[0], c[2]);
      mpz_mul (t[2], t[0], t[1]);
      mpz_sub (t[2], t[2], a[0]);
      mpz_sub (t[2], t[2], a[4]);
      mpz_add (a[2], a[2], t[2]);
    }
  else
    { 
      unsigned int i, k, l;
      listz_t z;

      k = K / 2;
      l = K - k;

      z = t + 2 * l - 1;

      /* improved code with 7*k-3 additions, 
         contributed by Philip McLaughlin <mpbjr@qwest.net> */
      for (i = 0; i < k; i++)
        {
          mpz_sub (z[i], b[i], b[l+i]);
          mpz_sub (a[i], c[i], c[l+i]);
        }

      if (l > k) /* case K odd */
        {
          mpz_set (z[k], b[k]);
          mpz_set (a[k], c[k]);
        }

      /* as b[0..l-1] + b[l..K-1] is stored in t[2l-1..3l-2], we need
         here at least 3l-1 entries in t */

      karatsuba (t, z, a, l, a + l); /* fills t[0..2l-2] */
       
      /* trick: save t[2l-2] in a[2l-1] to enable M(K) <= 2*K-1 */
      z = t + 2 * l - 2;
      mpz_set (a[2*l-1], t[2*l-2]);

      karatsuba (a, b, c, l, z); /* fill a[0..2l-2] */
      karatsuba (a + 2 * l, b + l, c + l, k, z); /* fills a[2l..2K-2] */

      mpz_set (t[2*l-2], a[2*l-1]); /* restore t[2*l-2] */
      mpz_set_ui (a[2*l-1], 0);

      /*
	      l          l-1     1    l          2k-1-l
        _________________________________________________
	|    a0    |     a1    |0|    a2    |     a3    |
        -------------------------------------------------
              l          l-1
        ________________________
	|    t0    |     t1    |
        ------------------------

	We want to replace [a1, a2] by [a1 + a0 + a2 - t0, a2 + a1 + a3 - t1]
	i.e. [a12 + a0 - t0, a12 + a3 - t1] where a12 = a1 + a2.
       */

      list_add (a + 2 * l, a + 2 * l, a + l, l-1); /* a[2l..3l-1] <- a1+a2 */
      if (k > 1)
        {
          list_add (a + l, a + 2 * l, a, l); /* a[l..2l-1] <- a0 + a1 + a2 */
          list_add (a + 2 * l, a + 2 * l, a + 3 * l, 2 * k - 1 - l);
        }
      else /* k=1, i.e. K=2 or K=3, and a2 has only one entry */
        {
          mpz_add (a[l], a[2*l], a[0]);
          if (K == 3)
            mpz_set (a[l+1], a[1]);
        }

      list_sub (a + l, a + l, t, 2 * l - 1);
    }
}

/* multiplies b[0]+...+b[k-1]*x^(k-1)+x^k by c[0]+...+c[l-1]*x^(l-1)+x^l
   and puts the results in a[0]+...+a[k+l-1]*x^(k+l-1)
   [the leading monomial x^(k+l) is implicit].
   If monic_b (resp. monic_c) is 0, don't consider x^k in b (resp. x^l in c).
   Assumes k = l or k = l+1.
   The auxiliary array t contains at least list_mul_mem(l) entries.
   a and t should not overlap.
*/
static void
list_mul (listz_t a, listz_t b, unsigned int k, int monic_b,
          listz_t c, unsigned int l, int monic_c, listz_t t)
{
  unsigned int i, po2;

  ASSERTD (k >= l);
  
  for (po2 = l; (po2 & 1) == 0; po2 >>= 1);
  po2 = (po2 == 1);

#ifdef DEBUG
  if (Fermat && !(po2 && l == k))
    fprintf (ECM_STDOUT, "list_mul: Fermat number, but poly lengths %d and %d\n", k, l);
#endif

  if (po2 && Fermat)
    {
      if (monic_b && monic_c && l == k)
        {
          F_mul (a, b, c, l, MONIC, Fermat, t);
          monic_b = monic_c = 0;
        }
      else
        F_mul (a, b, c, l, DEFAULT, Fermat, t);
    }
  else
    LIST_MULT_N (a, b, c, l, t); /* set a[0]...a[2l-2] */

  if (k > l) /* multiply b[l]*x^l by c[0]+...+c[l-1]*x^(l-1) */
    {
      for (i = 0; i < l - 1; i++)
        mpz_addmul (a[l+i], b[l], c[i]);
      mpz_mul (a[2*l-1], b[l], c[l-1]);
    }

  /* deal with x^k and x^l */
  if (monic_b || monic_c)
    {
      mpz_set_ui (a[k + l - 1], 0);
      
      if (monic_b && monic_c) /* Single pass over a[] */
        {
          /* a += b * x^l + c * x^k, so a[i] += b[i-l]; a[i] += c[i-k] 
             if 0 <= i-l < k  or  0 <= i-k < l, respectively */
          if (k > l)
            mpz_add (a[l], a[l], b[0]);
          for (i = k; i < k + l; i++)
            {
              mpz_add (a[i], a[i], b[i-l]); /* i-l < k */
              mpz_add (a[i], a[i], c[i-k]); /* i-k < l */
            }
        }
      else if (monic_c) /* add b * x^l */
        list_add (a + l, a + l, b, k);

      else /* only monic_b, add x^k * c */
        list_add (a + k, a + k, c, l);
    }
}

/*
  Multiplies b[0..k-1] by c[0..k-1], stores the result in a[0..2k-2],
  and stores the reduced product in a2[0..2k-2].
  (Here, there is no implicit monic leading monomial.)
  Requires at least list_mul_mem(k) cells in t.
 */
void
list_mulmod (listz_t a2, listz_t a, listz_t b, listz_t c, unsigned int k,
              listz_t t, mpz_t n)
{
  int i;

  for (i = k; (i & 1) == 0; i >>= 1);
  
  ASSERTD(list_check(b,k,n));
  ASSERTD(list_check(c,k,n));
  if (i == 1 && Fermat)
    F_mul (a, b, c, k, DEFAULT, Fermat, t);
  else
    LIST_MULT_N (a, b, c, k, t); /* set a[0]...a[2l-2] */

  list_mod (a2, a, 2 * k - 1, n);
}

/* puts in G[0]..G[k-1] the coefficients from (x-a[0])...(x-a[k-1])
   Warning: doesn't fill the coefficient 1 of G[k], which is implicit.
   Needs k + list_mul_mem(k/2) cells in T.
*/
void
PolyFromRoots (listz_t G, listz_t a, unsigned int k, listz_t T, mpz_t n)
{
  unsigned int l, m;

   if (k <= 1)
     {
       mpz_mod (G[0], a[0], n);
       return;
     }

    /* (x-a[0]) * (x-a[1]) = x^2 - (a[0]+a[1]) * x + a[0]*a[1]
       however we construct (x+a[0]) * (x+a[1]) instead, i.e. the
       polynomial with the opposite roots. This has no consequence if
       we do it for all polynomials: if F(x) and G(x) have a common root,
       then so do F(-x) and G(-x). This saves one negation.
     */
   if (k == 2)
     {
       mpz_mul (T[0], a[0], a[1]);
       mpz_add (G[1], a[1], a[0]);
       mpz_mod (G[1], G[1], n);
       mpz_mod (G[0], T[0], n);
       return;
     }

   m = k / 2;
   l = k - m;

   PolyFromRoots (G, a, l, T, n);
   PolyFromRoots (G + l, a + l, m, T, n);
   list_mul (T, G, l, 1, G + l, m, 1, T + k);
   list_mod (G, T, k, n);
}

/* puts in G[0]..G[k-1] the coefficients from (x-a[0])...(x-a[k-1])
   Warning: doesn't fill the coefficient 1 of G[k], which is implicit.
   Needs k + list_mul_mem(k/2) cells in T.
   The product tree is stored in:
   G[0..k-1]       (degree k)
   Tree[0][0..k-1] (degree k/2)
   Tree[1][0..k-1] (degree k/4), ...,
   Tree[lgk-1][0..k-1] (degree 1)
   (then we should have initially Tree[lgk-1] = a).

   depth is the depth (0 at root), dolvl signals that only this level of
   the tree should be computed (-1: all levels)
*/
int
PolyFromRoots_Tree (listz_t G, listz_t a, unsigned int k, listz_t T, 
               int dolvl, mpz_t n, listz_t *Tree, FILE *TreeFile, 
               unsigned int sh)
{
  unsigned int l, m;
  listz_t H1, *NextTree;

  if (k <= 1)
    {
      if (k == 1)
        mpz_set (G[0], a[0]);
      return 0;
    }

  if (Tree == NULL)
    {
      H1 = G;
      NextTree = NULL;
    }
  else
    {
      H1 = Tree[0] + sh;
      NextTree = Tree + 1;
    }

#if 0
  /* (x-a[0]) * (x-a[1]) = x^2 - (a[0]+a[1]) * x + a[0]*a[1]
     however we construct (x+a[0]) * (x+a[1]) instead, i.e. the
     polynomial with the opposite roots. This has no consequence if
     we do it for all polynomials: if F(x) and G(x) have a common root,
     then so do F(-x) and G(-x). This saves one negation.
  */
  if (k == 2)
    {
      mpz_set (H1[0], a[0]);
      mpz_set (H1[1], a[1]);
      mpz_mul (T[0], a[0], a[1]);
      mpz_add (G[1], a[1], a[0]);
      mpz_mod (G[1], G[1], n);
      mpz_mod (G[0], T[0], n);
      return 0;
    }
#endif

  m = k / 2;
  l = k - m;
  
  if (dolvl < 0 || dolvl > 0)
    {
      PolyFromRoots_Tree (H1, a, l, T, dolvl - 1, n, NextTree, TreeFile, sh);
      PolyFromRoots_Tree (H1 + l, a + l, m, T, dolvl - 1, n, NextTree, 
                          TreeFile, sh + l);
    }
  if (dolvl < 0 || dolvl == 0)
    {
      /* Write this level to disk, if requested */
      if (TreeFile != NULL)
        {
          if (list_out_raw (TreeFile, H1, l) == ECM_ERROR ||
              list_out_raw (TreeFile, H1 + l, m) == ECM_ERROR)
            {
              outputf (OUTPUT_ERROR, "Error writing product tree of F\n");
              return ECM_ERROR;
            }
        }
      list_mul (T, H1, l, 1, H1 + l, m, 1, T + k);
      list_mod (G, T, k, n);
    }
  
  return 0; 
}

/* puts in q[0..K-1] the quotient of x^(2K-2) by B
   where B = b[0]+b[1]*x+...+b[K-1]*x^(K-1) with b[K-1]=1.
*/
void
PolyInvert (listz_t q, listz_t b, unsigned int K, listz_t t, mpz_t n)
{
  if (K == 1)
    {
      mpz_set_ui (q[0], 1);
      return;
    }
  else
    {
      int k, l, po2, use_middle_product = 0;

#ifdef KS_MULTIPLY
      use_middle_product = 1;
#endif

      k = K / 2;
      l = K - k;

      for (po2 = K; (po2 & 1) == 0; po2 >>= 1);
      po2 = (po2 == 1 && Fermat != 0);

      /* first determine l most-significant coeffs of Q */
      PolyInvert (q + k, b + k, l, t, n); /* Q1 = {q+k, l} */

      /* now Q1 * B = x^(2K-2) + O(x^(2K-2-l)) = x^(2K-2) + O(x^(K+k-2)).
         We need the coefficients of degree K-1 to K+k-2 of Q1*B */

      ASSERTD(list_check(q+k,l,n) && list_check(b,l,n));
      if (po2 == 0 && use_middle_product)
        {
          TMulKS (t, k - 1, q + k, l - 1, b, K - 1, n, 0);
          list_neg (t, t, k, n);
        }
      else if (po2)
        {
          list_revert (q + k, l - 1);
          /* This expects the leading monomials explicitly in q[2k-1] and b[k+l-1] */
          F_mul_trans (t, q + k, b, K, Fermat, t + k);
          list_revert (q + k, l - 1);
          list_neg (t, t, k, n);
        }
      else
        {
          LIST_MULT_N (t, q + k, b, l, t + 2 * l - 1); /* t[0..2l-1] = Q1 * B0 */
          list_neg (t, t + l - 1, k, n);
      
          if (k > 1)
            {
              list_mul (t + k, q + k, l - 1, 1, b + l, k - 1, 1,
			t + k + K - 2); /* Q1 * B1 */
              list_sub (t + 1, t + 1, t + k, k - 1);
            }
        }
      list_mod (t, t, k, n); /* high(1-B*Q1) */

      ASSERTD(list_check(t,k,n) && list_check(q+l,k,n));
      if (po2)
        F_mul (t + k, t, q + l, k, DEFAULT, Fermat, t + 3 * k);
      else
        LIST_MULT_N (t + k, t, q + l, k, t + 3 * k - 1);
      list_mod (q, t + 2 * k - 1, k, n);
    }
}

/*
  divides a[0]+a[1]*x+...+a[2K-1]*x^(2K-1)
  By b[0]+b[1]*x+...+b[K-1]*x^(K-1)+x^K
  i.e. a polynomial of 2K coefficients divided by a monic polynomial
  with K+1 coefficients (b[K]=1 is implicit).
  Puts the quotient in q[0]+q[1]*x+...+q[K-1]*x^(K-1)
  and the remainder in a[0]+a[1]*x+...+a[K-1]*x^(K-1)
  Needs space for list_mul_mem(K) coefficients in t.
  If top is non-zero, a[0]..a[K-1] are reduced mod n.
*/
void
RecursiveDivision (listz_t q, listz_t a, listz_t b, unsigned int K,
                   listz_t t, mpz_t n, int top)
{
  if (K == 1) /* a0+a1*x = a1*(b0+x) + a0-a1*b0 */
    {
      mpz_mod (a[1], a[1], n);
      mpz_mul (q[0], a[1], b[0]);
      mpz_mod (q[0], q[0], n);
      mpz_sub (a[0], a[0], q[0]);
      if (top)
        mpz_mod (a[0], a[0], n);
      mpz_set (q[0], a[1]);
    }
  else
    {
      unsigned int k, l, i, po2;

      k = K / 2;
      l = K - k;
      for (po2 = K; (po2 && 1) == 0; po2 >>= 1);
      po2 = (po2 == 1);

      /* first perform a (2l) / l division */
      RecursiveDivision (q + k, a + 2 * k, b + k, l, t, n, 0);
      /* subtract q[k..k+l-1] * b[0..k-1] */
      ASSERTD(list_check(q+l,k,n) && list_check(b,k,n));
      if (po2 && Fermat)
        F_mul (t, q + l, b, k, DEFAULT, Fermat, t + K); /* sets t[0..2*k-2]*/
      else
        LIST_MULT_N (t, q + l, b, k, t + K - 1); /* sets t[0..2*k-2] */
      list_sub (a + l, a + l, t, 2 * k - 1);
      if (k < l) /* don't forget to subtract q[k] * b[0..k-1] */
        {
	  for (i=0; i<k; i++)
	    {
	      mpz_mul (t[0], q[k], b[i]); /* TODO: need to reduce t[0]? */
	      mpz_sub (a[k+i], a[k+i], t[0]);
	    }
        }
      /* remainder is in a[0..K+k-1] */

      /* then perform a (2k) / k division */
      RecursiveDivision (q, a + l, b + l, k, t, n, 0);
      /* subtract q[0..k-1] * b[0..l-1] */
      ASSERTD(list_check(q,k,n) && list_check(b,k,n));
      if (po2 && Fermat)
        F_mul (t, q, b, k, DEFAULT, Fermat, t + K);
      else
        LIST_MULT_N (t, q, b, k, t + K - 1);
      list_sub (a, a, t, 2 * k - 1);
      if (k < l) /* don't forget to subtract q[0..k-1] * b[k] */
        {
          for (i=0; i<k; i++)
            {
              mpz_mul (t[0], q[i], b[k]); /* TODO: need to reduce t[0]? */
              mpz_sub (a[k+i], a[k+i], t[0]);
            }
        }

      /* normalizes the remainder wrt n */
      if (top)
        list_mod (a, a, K, n);
    }
}

/*
  Returns in a[0]+a[1]*x+...+a[K-1]*x^(K-1)
  the remainder of the division of
  A = a[0]+a[1]*x+...+a[2K-2]*x^(2K-2)
  by B = b[0]+b[1]*x+...+b[K-1]*x^(K-1)+b[K]*x^K with b[K]=1 *explicit*.
  (We have A = Q*B + R with deg(Q)=K-2 and deg(R)=K-1.)
  Assumes invb[0]+invb[1]*x+...+invb[K-2]*x^(K-2) equals Quo(x^(2K-2), B).
  Assumes K >= 2.
  Requires 2K-1 + list_mul_mem(K) cells in t.

  Notations: R = r[0..K-1], A = a[0..2K-2], low(A) = a[0..K-1],
  high(A) = a[K..2K-2], Q = t[0..K-2]
  Return non-zero iff an error occurred.
*/
int
PrerevertDivision (listz_t a, listz_t b, listz_t invb,
                   unsigned int K, listz_t t, mpz_t n)
{
  int po2, wrap;
  listz_t t2 = NULL;
#ifdef WRAP
  wrap = ks_wrapmul_m (K + 1, K + 1, n) <= 2 * K - 1 + list_mul_mem (K);
#else
  wrap = 0;
#endif

  /* Q <- high(high(A) * INVB) with a short product */
  for (po2 = K; (po2 & 1) == 0; po2 >>= 1);
  po2 = (po2 == 1);
  if (Fermat && po2)
    {
      mpz_set_ui (a[2 * K - 1], 0);
      if (K <= 4 * Fermat)
        {
          F_mul (t, a + K, invb, K, DEFAULT, Fermat, t + 2 * K);
          /* Put Q in T, as we still need high(A) later on */
          list_mod (t, t + K - 2, K, n);
        }
      else
        {
          F_mul (t, a + K, invb, K, DEFAULT, Fermat, t + 2 * K);
          list_mod (a + K, t + K - 2, K, n);
        }
    }
  else /* non-Fermat case */
    {
      list_mul_high (t, a + K, invb, K - 1, t + 2 * K - 3);
      /* the high part of A * INVB is now in {t+K-2, K-1} */
      if (wrap)
	{
	  t2 = init_list (K - 1);
	  if (t2 == NULL)
	    {
	      fprintf (ECM_STDERR, "Error, not enough memory\n");
	      return ECM_ERROR;
	    }
	  list_mod (t2, t + K - 2, K - 1, n);
	}
      else /* we can store in high(A) which is no longer needed */
	list_mod (a + K, t + K - 2, K - 1, n);
    }

  /* the quotient Q = trunc(A / B) has degree K-2, i.e. K-1 terms */

  /* T <- low(Q * B) with a short product */
  mpz_set_ui (a[2 * K - 1], 0);
  if (Fermat && po2)
    {
      if (K <= 4 * Fermat)
        {
          /* Multiply without zero padding, result is (mod x^K - 1) */
          F_mul (t + K, t, b, K, NOPAD, Fermat, t + 2 * K);
          /* Take the leading monomial x^K of B into account */
          list_add (t, t + K, t, K);
          /* Subtract high(A) */
          list_sub(t, t, a + K, K);
        }
      else
        F_mul (t, a + K, b, K, DEFAULT, Fermat, t + 2 * K);
    }
  else /* non-Fermat case */
    {
#ifdef KS_MULTIPLY /* ks is faster */
      if (wrap)
        /* Q = {t2, K-1}, B = {b, K+1}
           We know that Q*B vanishes with the coefficients of degree
           K to 2K-2 of {A, 2K-1} */
        {
          unsigned int m;
          m = ks_wrapmul (t, K + 1, b, K + 1, t2, K - 1, n);
          clear_list (t2, K - 1);
          /* coefficients of degree m..2K-2 wrap around,
             i.e. were subtracted to 0..2K-2-m */
          if (m < 2 * K - 1) /* otherwise product is exact */
            list_add (t, t, a + m, 2 * K - 1 - m);
        }
      else
        LIST_MULT_N (t, a + K, b, K, t + 2 * K - 1);
#else
      list_mul_low (t, a + K, b, K, t + 2 * K - 1, n);
#endif
    }

  /* now {t, K} contains the low K terms from Q*B */
  list_sub (a, a, t, K);
  list_mod (a, a, K, n);

  return 0;
}

/* Puts in inv[0..l-1] the inverses of a[0..l-1] (mod n), using 3*(l-1) 
   multiplies and one gcdext.
   Returns 1 if a factor was found (stored in t), 0 otherwise.
*/
int
list_invert (listz_t inv, listz_t a, unsigned int l, mpz_t t, mpmod_t modulus)
{
  unsigned int i;
  
  if (l == 0)
    return 0;
  
  mpz_set (inv[0], a[0]);
  
  for (i = 1; i < l; i++)
    {
      mpz_mul (t, inv[i-1], a[i]);
      mpz_mod (inv[i], t, modulus->orig_modulus); /* inv[i] = a[0]*...*a[i] */
    }
  
  mpz_gcdext (t, inv[l-1], NULL, inv[l-1], modulus->orig_modulus);
  
  if (mpz_cmp_ui (t, 1) != 0)
    return 1;
  
  for (i = l-1; i > 0; i--)
    {
      mpz_mul (t, inv[i], inv[i-1]); /* t = (a[0]*...*a[i])^(-1) * (a[0]*...*a[i-1]) = a[i]^(-1) */
      mpz_mul (inv[i-1], inv[i], a[i]); /* inv[i-1] = (a[0]*...*a[i])^(-1) * a[i] = (a[0]*...*a[i-1])^(-1) */
      mpz_mod (inv[i-1], inv[i-1], modulus->orig_modulus);
      mpz_mod (inv[i], t, modulus->orig_modulus);
    }
  
  return 0;
}
