/* interpo.c : Extention library regarding interpolation of GPhys
 */

#include<stdio.h>
#include<string.h>
#include "ruby.h"
#include "narray.h"

#ifndef RARRAY_PTR
#  define RARRAY_PTR(ary) (RARRAY(ary)->ptr)
#endif
#ifndef RARRAY_LEN
#  define RARRAY_LEN(ary) (RARRAY(ary)->len)
#endif

/* for compatibility for NArray and NArray with big memory patch */
#ifndef NARRAY_BIGMEM
typedef int na_shape_t;
#endif

static VALUE
interpo_do(obj, shape_to, idxmap, val_from, missval, extrapo)
     VALUE obj;
     VALUE shape_to; // [Array] shape of the new grid
     VALUE idxmap;   // [Array] index mapping
     VALUE val_from; // [NArray] values to be interpolated
     VALUE missval;  // [nil or Float] missing value if Float
     VALUE extrapo; // [false/true] whether to extrapolate
{
    VALUE val_to;
    VALUE chk , ary;
    struct NARRAY *naf;
    VALUE vm;
    int natype;
    na_shape_t *sht;  // shape to
    na_shape_t *shf;  // shape from
    na_shape_t cshf;
    na_shape_t rankf, rankt, lent;
    double *pt, *pf;
    int nomiss;
    double vmiss;
    int extr;
    na_shape_t i, it, kf, j, k, l;
    int *mtyp;
    na_shape_t *dm, *dm2, *dm2f, *dm3, *dm3f, *dm4, *dm4f;
    int **di, nid, nic;
    double **df, *f, a;
    int *idf, idfc, *idt;

    // check arguments

    if (TYPE(shape_to) != T_ARRAY){
        rb_raise(rb_eTypeError, "1st arg must be an array");
    }
    if (TYPE(idxmap) != T_ARRAY){
        rb_raise(rb_eTypeError, "2nd arg must be an array");
    }

    chk = rb_obj_is_kind_of(val_from, cNArray);
    if (chk == Qfalse) {
	rb_raise(rb_eTypeError, "3rd arg must be an NArray");
    }

    // read argument (shape_to)

    rankt = RARRAY_LEN(shape_to);
    sht = ALLOCA_N(na_shape_t, rankt);
    for(i=0; i<rankt; i++){
	sht[i] = NUM2INT( RARRAY_PTR(shape_to)[i] );
    }

    // read argument (val_from)

    natype = NA_TYPE(val_from);  // saved
    val_from = na_cast_object(val_from, NA_DFLOAT);
    rankf = NA_RANK(val_from);
    GetNArray(val_from, naf);
    shf = naf->shape;
    pf = NA_PTR_TYPE(val_from, double *);

    // read argument (missval)

    nomiss = (missval == Qnil);
    if (!nomiss){
	vmiss = NUM2DBL(missval);
    } else {
	vmiss = -999.0;   // dummy to avoid compiler warning (not used)
    }
    extr = (extrapo != Qfalse);  // false -> 0(false); else -> true

    // read argument (idxmap) 

    if (RARRAY_LEN(idxmap) != rankf){
        rb_raise(rb_eArgError, "length of 2nd arg and rank of 3rd arg must agree");
    }

    mtyp = ALLOCA_N(int,    rankf);
    dm   = ALLOCA_N(na_shape_t, rankf);
    dm2  = ALLOCA_N(na_shape_t, rankf);
    dm2f = ALLOCA_N(na_shape_t, rankf);
    dm3  = ALLOCA_N(na_shape_t, rankf);
    dm3f = ALLOCA_N(na_shape_t, rankf);
    dm4  = ALLOCA_N(na_shape_t, rankf);
    dm4f = ALLOCA_N(na_shape_t, rankf);
    di   = ALLOCA_N(int *,    rankf);
    df   = ALLOCA_N(double *, rankf);
    nid = 0;
    for(i=0; i<rankf; i++){
	vm = RARRAY_PTR(idxmap)[i];
	if (RARRAY_LEN(vm) == 1) {
	    mtyp[i] = 0;   // simple copying
	    dm[i]  = NUM2INT( RARRAY_PTR(vm)[0] ); 
	} else if ( RARRAY_PTR(vm)[1] == Qnil ) {
	    mtyp[i] = 1;   // mapping from 1D
	    dm[i]  = NUM2INT( RARRAY_PTR(vm)[0] ); 
	    di[i] = NA_PTR_TYPE( RARRAY_PTR(vm)[2], int *);
	    df[i] = NA_PTR_TYPE( RARRAY_PTR(vm)[3], double *);
	    nid++;
	} else {
	    dm[i]  = NUM2INT( RARRAY_PTR(vm)[0] ); 
	    ary = RARRAY_PTR(vm)[1];
	    mtyp[i] = RARRAY_LEN(ary)+1;   // mapping from multi-D
	    switch( mtyp[i] ){
	    case 4:  // do not break
		dm4[i] = NUM2INT( RARRAY_PTR(ary)[2] ); 
		dm4f[i]= sht[NUM2INT( RARRAY_PTR(ary)[1] )]; 
	    case 3:  // do not break
		dm3[i] = NUM2INT( RARRAY_PTR(ary)[1] ); 
		dm3f[i]= sht[NUM2INT( RARRAY_PTR(ary)[0] )];
	    case 2:
		dm2[i] = NUM2INT( RARRAY_PTR(ary)[0] ); 
		dm2f[i]= sht[dm[i]]; 
	    }
	    di[i] = NA_PTR_TYPE( RARRAY_PTR(vm)[2], int *);
	    df[i] = NA_PTR_TYPE( RARRAY_PTR(vm)[3], double *);
	    nid++;
	}
    }

    f = ALLOCA_N(double, nid);
    nic = 1 << nid ;   // ==> 2**nid

    // prepare output object

    val_to = na_make_object(NA_DFLOAT, rankt, sht, cNArray);  // double for a momnent
    lent = NA_TOTAL(val_to);
    pt = NA_PTR_TYPE(val_to, double *);


    // do interpolation

    idt  = ALLOCA_N(int, rankt);
    idf  = ALLOCA_N(int, rankf);

    for(it=0; it<lent; it++){
	l = it;
	for(j=0; j<rankt; j++){
	    idt[j] = l % sht[j];
	    l /= sht[j];
	}
	k = 0;
	for(j=0; j<rankf; j++){
	    switch(mtyp[j]){
	    case 0:
		idf[j] = idt[dm[j]];
		break;
	    case 1:
		idf[j] = di[j][ idt[dm[j]] ];
		f[k] = df[j][ idt[dm[j]] ];
		k++;
		break;
	    case 2:
		idf[j] = di[j][ idt[dm[j]] + dm2f[j]*idt[dm2[j]] ];
		f[k] = df[j][ idt[dm[j]] + dm2f[j]*idt[dm2[j]] ];
		k++;
		break;
	    case 3:
                idf[j] = di[j][ idt[dm[j]] + 
			      dm2f[j]*(idt[dm2[j]] + dm3f[j]*idt[dm3[j]]) ];
		f[k] = df[j][ idt[dm[j]] + 
			      dm2f[j]*(idt[dm2[j]] + dm3f[j]*idt[dm3[j]]) ];
		k++;
		break;
	    case 4:
		idf[j] = di[j][ idt[dm[j]] + 
			      dm2f[j]*(idt[dm2[j]] + dm3f[j]*
				       (idt[dm3[j]]+dm4f[j]*idt[dm4[j]])) ];
		f[k] = df[j][ idt[dm[j]] + 
			      dm2f[j]*(idt[dm2[j]] + dm3f[j]*
				       (idt[dm3[j]]+dm4f[j]*idt[dm4[j]])) ];
		k++;
		break;
	    }
	}
	pt[it] = 0.0;
	for(l=0; l<nic; l++){   // loop for 2**nid times
	    a = 1.0;
	    for(k=0; k<nid; k++){
		a = ( (l>>k)%2 ? a*f[k] : a*(1.0 - f[k]) );
	    }
	    cshf=1;
	    kf = 0;
	    k = 0;
	    for(j=0; j<rankf; j++){
		idfc = idf[j];
                if(!extr && idfc<0){
                    pt[it] = vmiss;
                    break;
                }
		if(mtyp[j]>0){
                    if ( (l>>k)%2 && idfc<shf[j]-1 ){
                        idfc += 1;
                    }
		    k++;
		}
		kf += idfc*cshf;
		cshf *= shf[j];
	    }
            if (pt[it] == vmiss) {break;}
	    if (nomiss || pf[kf] != vmiss){
		pt[it] += a*pf[kf];
	    } else {
		pt[it] = vmiss;
		break;
	    }
            //printf("$$$$ %d %f %f\n",it, a, pf[kf]);
	}
        //printf("//// %d %f\n",it, pt[it]);
    }

    // finish
    val_to = na_cast_object(val_to, natype);
    return val_to;
}

static void
__interpo_find_loc_1D(N, P, n, p, vmiss, extr, ids, f)
     double *P, *p;   // INPUT
     na_shape_t N, n;     // INPUT
     double vmiss;    // INPUT
     int extr;        // INPUT
     int *ids;        // OUTPUT
     double *f;       // OUTPUT
{
    na_shape_t j;
    int i, il, ir;
    int down;

    // first time finding : use a simple looping 
    j = 0;
    for(i=0; i<n-1; i++){
	if ( p[i] != vmiss && p[i+1] != vmiss && 
             (p[i]-P[j])*(P[j]-p[i+1]) >= 0 ){
	    break;
	}
    }
    if (i<n-1){
	// found
	ids[j] = i;
    } else if (extr) {
	// not found --> to be extrapolated
	if ( (P[j]-p[0])*(p[0]-p[n-1]) >= 0 ){
	    ids[j] = i = 0;
	} else {
	    ids[j] = i = n-2;
	}
    } else {
        ids[j] = -999999;   // a negative value
        i = 0;
    }
    f[j] = (p[i]-P[j])/(p[i]-p[i+1]);

    // second or later time finding : start from the previous position
    for(j=1; j<N; j++){
	down = 1; // true : move i downward next time
	il = ir = i;
	while (1){
	    if ( (p[i]-P[j])*(P[j]-p[i+1]) >= 0 ) {
		break;
	    } else {
		if ( il>0 && ( down || ir==n-2 )  ){
		    il--;
		    i = il;
		    down = 0; // false
		} else if ( ir<n-2 && ( !down || il==0 ) ){
		    ir++;
		    i = ir;
		    down = 1; // true
		} else {
		    // not found
                    if (extr) {
                        // to be extrapolated
                        if ( (P[j]-p[0])*(p[0]-p[n-1]) >= 0 ){
                            i = 0;
                        } else {
                            i = n-2;
                        }
                    } else {
                        i = -999999;  // a negative value (changed to 0 below)
		    }
		    break;
		}
	    }
	}
	ids[j] = i;
        if(i<0){i = 0; down = 0;}
	f[j] = (p[i]-P[j])/(p[i]-p[i+1]);
    }
}

static VALUE
interpo_find_loc_1D(obj, X, x, missval, extrapo)
     VALUE obj;
     VALUE X; // [NArray(1D)] coordinate values onto which interpolation is made
     VALUE x; // [NArray(1D)] coordinate values of original data
     VALUE missval; // [Float] missing value in x
     VALUE extrapo; // [false/true] whether to extrapolate
{
    VALUE na_ids, na_f;
    VALUE chk;
    na_shape_t N, n;
    struct NARRAY *na;
    double *P, *p, vmiss;
    int extr;
    int *ids;
    double *f;
    na_shape_t shape[1];

    // check and unwrap the input arguments

    chk = rb_obj_is_kind_of(X, cNArray);
    if (chk == Qfalse) {rb_raise(rb_eTypeError, "expect NArray (1st arg)");}

    chk = rb_obj_is_kind_of(x, cNArray);
    if (chk == Qfalse) {rb_raise(rb_eTypeError, "expect NArray (2nd arg)");}

    X = na_cast_object(X, NA_DFLOAT);
    GetNArray(X, na);
    N = na->total;
    P = (double *)NA_PTR(na, 0);

    x = na_cast_object(x, NA_DFLOAT);
    GetNArray(x, na);
    n = na->total;
    p = (double *)NA_PTR(na, 0);

    vmiss = NUM2DBL(missval);

    extr = (extrapo != Qfalse);  // false -> 0(false); else -> true

    // prepare output NArrays

    shape[0] = N;
    na_ids = na_make_object(NA_LINT, 1, shape, cNArray);
    GetNArray(na_ids, na);
    ids = (int *) NA_PTR(na, 0);
    na_f = na_make_object(NA_DFLOAT, 1, shape, cNArray);
    GetNArray(na_f, na);
    f = (double *) NA_PTR(na, 0);

    // Do the job

    __interpo_find_loc_1D(N, P, n, p, vmiss, extr, ids, f);

    // Return

    return rb_ary_new3(2, na_ids, na_f);
}

/* To apply interpo_find_loc_1D multi-dimensionally 
 */
static VALUE
interpo_find_loc_1D_MD(obj, X, x, dimc, missval, extrapo)
     VALUE obj;
     VALUE X; // [NArray(1D)] coordinate values onto which interpolation is made
     VALUE x; // [NArray(multi-D)] coordinate values of original data
     VALUE dimc; // [Integer] the dimension in x except which mapping has been set
     VALUE missval; // [Float] missing value in x
     VALUE extrapo; // [false/true] whether to extrapolate
{
    VALUE na_ids, na_f;
    VALUE chk;
    na_shape_t N, n1;
    struct NARRAY *na;
    double *P, *p, *p1, vmiss;
    int extr;
    int *ids, *ids1;
    double *f, *f1;
    na_shape_t *shr;  // shape of the result
    na_shape_t *shl;  // shape of multi-D loop
    na_shape_t *shx;  // shape of x
    int dmc; // dimc
    int rank, dl, dc;
    na_shape_t *cshxl, *cshrl, *cshl; // cumulative shapes for looping
    na_shape_t fxl;    // same but only for the dimension treated here
    na_shape_t il, i, j, ix, ir, totl;

    // check and unwrap the input arguments

    chk = rb_obj_is_kind_of(X, cNArray);
    if (chk == Qfalse) {rb_raise(rb_eTypeError, "expect NArray (1st arg)");}

    chk = rb_obj_is_kind_of(x, cNArray);
    if (chk == Qfalse) {rb_raise(rb_eTypeError, "expect NArray (2nd arg)");}

    X = na_cast_object(X, NA_DFLOAT);
    GetNArray(X, na);
    N = na->total;
    P = (double *)NA_PTR(na, 0);

    dmc = NUM2INT( dimc ); 

    vmiss = NUM2DBL(missval);

    extr = (extrapo != Qfalse);  // false -> 0(false); else -> true

    x = na_cast_object(x, NA_DFLOAT);
    GetNArray(x, na);
    p = (double *)NA_PTR(na, 0);
    shx = na->shape; 
    n1 = shx[dmc];

    rank = NA_RANK(x);

    if (dmc<0 || dmc>=rank){
        rb_raise(rb_eArgError, "Specified dimension (4th argument) is outside the dims of the multi-D coordinate variable");
    }

    // prepare output NArrays

    shl = ALLOCA_N(na_shape_t, rank-1);
    shr = ALLOCA_N(na_shape_t, rank);
    shr[0] = N;
    totl = 1;
    for(dl=0,dc=0; dl<rank-1; dl++,dc++){
	if(dc==dmc){dc++;}  // to skip shx[dmc]
	shr[dl+1] = shl[dl] = shx[dc]; 
	totl *= shl[dl];
    }

    cshl = ALLOCA_N(na_shape_t, rank-1);
    cshl[0] = 1;
    for(dl=1; dl<rank-1; dl++){
	cshl[dl] = cshl[dl-1]*shl[dl-1];
    }

    cshxl = ALLOCA_N(na_shape_t, rank-1);
    if(dmc==0) {
 	fxl = 1;
	cshxl[0] = shx[0];
    } else {
 	fxl = shx[0];
	cshxl[0] = 1;
    }

    for(dl=1,dc=1; dl<rank-1; dl++,dc++){
	if(dc==dmc){
	    fxl = cshxl[dl-1]*shx[dc-1];
	    dc++;
            cshxl[dl] = fxl*shx[dc-1];
	} else {
            cshxl[dl] = cshxl[dl-1]*shx[dc-1];
        }
    }
    if (dmc==rank-1){
	fxl = cshxl[rank-2]*shx[rank-2];
    }

    cshrl = ALLOCA_N(na_shape_t, rank-1);
    cshrl[0] = shr[0];
    for(dl=1; dl<rank-1; dl++){
	cshrl[dl] = cshrl[dl-1]*shr[dl];
    }

    na_ids = na_make_object(NA_LINT, rank, shr, cNArray);
    GetNArray(na_ids, na);
    ids = (int *) NA_PTR(na, 0);
    na_f = na_make_object(NA_DFLOAT, rank, shr, cNArray);
    GetNArray(na_f, na);
    f = (double *) NA_PTR(na, 0);

    // Do the job

    p1 =  ALLOCA_N(double, n1);
    ids1 =  ALLOCA_N(int, N);
    f1 =  ALLOCA_N(double, N);
    for(il=0; il<totl; il++){

	// put a 1D subset of p into p1 (p1 = p[..., true,...])

	ix = 0;
	ir = 0;
	for(dl=0; dl<rank-1; dl++){
	    i = (il/cshl[dl]) % shl[dl];
	    ix += cshxl[dl]*i;
	    ir += cshrl[dl]*i;
	}

	for(j=0; j<n1; j++){
	    p1[j] = p[ix+fxl*j];
	}

	// find loc in 1D

	__interpo_find_loc_1D(N, P, n1, p1, vmiss, extr, ids1, f1);

	// substitute ids1 and f1 (1D) into ids and f (multi-D)
 
	for(j=0; j<N; j++){
	    ids[ir+j] = ids1[j];
	    f[ir+j] = f1[j];
	    //printf("  %d %f\n",ids1[j],f1[j]);
	}
	//printf("\n");

    }

    // Return

    return rb_ary_new3(2, na_ids, na_f);
}

/**
 * @brief interpolate missing
 * 
 * @details
 * input data is converted into double (NArray's Float) type. So unuable for
 * complex data.
 * 
 * Supports cyclic interpolation. Extrapolation is not supported (so far).
 * 
 * Returned is a two-element Array consisting of the interpolated NArray (rval)
 * and the updated mask (rmask).
 */
static VALUE
interpo_missing(obj, val, mask, dim, pos, cyclic, modulo)
     VALUE obj;
     VALUE val; // [NArray] real/int (not complex) NArray with data missing
     VALUE mask;  // [NArray (byte)] mask array (shape same as val)
     VALUE dim;  // [Integer] dimension along which missing is interpolated
     VALUE pos; // [1D NArray] grid point positions alog dim
     VALUE cyclic; // [false/true] whether cyclic extension is conducted
     VALUE modulo; // [Float] modulo used when cyclic
     // VALUE extrapo; // [false/true] whether to extrapolate when !cyclic
{
    VALUE rval, rmask, rary; // result
    struct NARRAY *na;
    na_shape_t rank;
    na_shape_t *sh, sh0=1, sh1, sh2=1, i, j, k, js, je;
    int dm, d;
    double *y, ys, ye, *x, xs, xe, xc, mod;
    u_int8_t *mk;
 
    rval = na_dup_w_type(val, NA_DFLOAT);
    rank = NA_RANK(rval);
    GetNArray(rval, na);
    sh = na->shape;
    y = NA_PTR_TYPE(rval, double *);

    if ( NA_TOTAL(mask) != NA_TOTAL(val) )
        rb_raise(rb_eArgError, "The lengths of val and mask disgagree.");
    if ( na_get_typecode(mask) != NA_BYTE) 
        rb_raise(rb_eArgError, "mask must be a byte NArray");
    rmask = na_clone(mask);
    mk = NA_PTR_TYPE(rmask, u_int8_t *);

    rary = rb_ary_new();
    rb_ary_push(rary, rval);
    rb_ary_push(rary, rmask);
    
    if (cyclic == Qtrue) mod = NUM2DBL(modulo);
    
    dm = NUM2INT(dim);
    if (dm < 0) dm += rank;
    if (dm < 0 || dm >= rank)
        rb_raise(rb_eArgError, "dim %d does not exist in the rank %d array",
                 NUM2INT( dim ), rank);
    for (d=0; d<dm; d++) sh0 *= sh[d];
    sh1 = sh[dm];
    for (d=dm+1; d<rank; d++) sh2 *= sh[d];

    if ( NA_TOTAL(pos) != sh1 )
        rb_raise(rb_eArgError, "len of pos (%d) != len along dim (%d)",
                 NA_TOTAL(rval), sh1);
    pos = na_cast_object(pos, NA_DFLOAT);
    x = NA_PTR_TYPE(pos, double *);
    for (k=0; k<sh2; k++){
        for (i=0; i<sh0; i++){
            js = -1;
            je = 0;
            while ( je < sh1 ) {
                for( j=je; j<sh1-1; j++ ) {
                    if ( mk[i+sh0*(j+sh1*k)] &&
                         !mk[i+sh0*((j+1)+sh1*k)] ) {
                        js = j;
                        break;
                    }
                }
                if (js >= je) {
                    for( j=js+1; j<sh1-1; j++ ) {
                        if ( !mk[i+sh0*(j+sh1*k)] &&
                             mk[i+sh0*((j+1)+sh1*k)] ){
                            je = j+1;
                            break;
                        }
                    }
                    if (je > js+1) {
                        // interpolate between js & je
                        xs = x[js];
                        xe = x[je];
                        ys = y[i+sh0*(js+sh1*k)];
                        ye = y[i+sh0*(je+sh1*k)];
                        for( j=js+1; j<je; j++ ) {
                            y[i+sh0*(j+sh1*k)] =
                                ( (xe-x[j])*ys + (x[j]-xs)*ye ) / (xe-xs);
                            mk[i+sh0*(j+sh1*k)] = (u_int8_t) 1;
                        }
                    } else {
                        break; // no non-cyclic gap remains -> break
                    }
                } else {
                    break; // no (further) non-cyclic gap found -> break
                }
            }
            if (cyclic == Qtrue && ( !mk[i+sh0*(0     +sh1*k)] ||
                                     !mk[i+sh0*(sh1-1 +sh1*k)] ) ) {
                // the first or last element remains as missing
                js = je = -1;
                for( j=sh1-1; j>=0; j-- ) {
                    if ( mk[i+sh0*(j+sh1*k)] ) {
                        js = j;
                        break;
                    }
                }
                for( j=0; j<=js; j++ ) {
                    if ( mk[i+sh0*(j+sh1*k)] ) {
                        je = j+sh1;
                        break;
                    }
                }
                if (js>=0 && je>=sh1) {
                    xs = x[js];
                    xe = x[je-sh1] + mod;
                    ys = y[i+sh0*(js+sh1*k)];
                    ye = y[i+sh0*(je-sh1+sh1*k)];
                    for( j=js+1; j<je; j++ ) {
                        xc = x[j % sh1] + (j/sh1)*mod;
                        y[i+sh0*(j%sh1+sh1*k)] =
                            ( (xe-xc)*ys + (xc-xs)*ye ) / (xe-xs) ;
                        mk[i+sh0*(j%sh1+sh1*k)] = (u_int8_t) 1;
                    }
                }
            }
        }
    }
    return rary;
}

/**
 * @brief multi-dimensional regrid for based on indices (which is essentially fro equally separated grid)
 * 
 * @details
 * input data is converted into double (NArray's Float) type. So unuable for
 * complex data.
 * 
 * * dims must be an Array containing integers (>=0 & < rank)
 * * lengths of the Array arguments must be the same (rclen),
 *   and the rank of NArrays in scidxs must also be rclen.
 * * shapes of the NArrays in scidxs must be the same
 * 
 * Supports cyclic interpolation. Extrapolation is not supported.
 * 
 * Returned is a two-element Array consisting of the interpolated NArray (rval)
 * and the updated mask (rmask).
 * 
 * @note
 * implemntation strategy: make a consolidate single loop for the return 
 * value
 */
static VALUE
regrid2_w_idx(obj, val, mask, dims, scidxs, cyclics)
     VALUE obj;
     VALUE val; // [NArray] real/int (not complex) NArray with data missing
     VALUE mask; // [NArray (byte)] mask array (shape same as val)
     VALUE dims; // [Array of Integer] dimensions along which regrid is made
     VALUE scidxs; // [Array of NArray] float index vals at which sampled (0..)
     VALUE cyclics; // [Array of true/false or nil] wheather the dim is cyclic.
                    // modulo is assumed to be the dimension length, so
                    // available only when exactly 1-grid extension is needed
{
    VALUE rval, rmask, rary; // result
    VALUE scidx; // (work) one of scidxs
    double *iv, *rv; // input & return values
    double **sci; // sampling index values
    int *cyc;
    double a, c; // subgrid position and coefficient for linear interporation
    u_int8_t *imsk, *rmsk; // input & return masks
    struct NARRAY *na;  // work
    na_shape_t *ish, *rsh, *ssh;   // shapes of input, retrun & sampling values
    na_shape_t *icsh, *rcsh, *scsh; // cumulative shapes
    na_shape_t i, j, k, m, si;
    na_shape_t ii, ii0, dii; // related to 1D indices along input val
    na_shape_t itot, rtot, stot; // total array lengths (input, return, sample)
    int rank, ndims, d, dc, *dms, nd2p, kup;
    int *dimmap;  // to indicate dimensions to be sampled and left

    ndims = RARRAY_LEN(dims);
    if (ndims != RARRAY_LEN(scidxs)) 
        rb_raise(rb_eArgError, "lens of dims & scidx do not agree");
    if (!NIL_P(cyclics) && ndims != RARRAY_LEN(cyclics))
            rb_raise(rb_eArgError, "lens of dims & cyclics do not agree");

    val = na_cast_object(val, NA_DFLOAT);
    rank = NA_RANK(val);
    if (ndims>rank)
        rb_raise(rb_eArgError, "# of dims exceeds val.rank (%d)", rank);
    GetNArray(val, na);
    ish = na->shape;
    iv = NA_PTR_TYPE(val, double *);
    itot = NA_TOTAL(val);
    if (itot != NA_TOTAL(mask))
        rb_raise(rb_eArgError, "lens of val and mask do not agree");
    imsk = NA_PTR_TYPE(mask, u_int8_t *);

    scidx = RARRAY_AREF(scidxs, 0);
    if (NA_RANK(scidx) != ndims)
        rb_raise(rb_eArgError, "rank of sampling indices != # of dims");
    GetNArray(scidx, na);
    ssh = na->shape;
    stot = NA_TOTAL(scidx);
    sci =  ALLOCA_N(double *, ndims);
    for (d=1; d<ndims; d++) {
        if (NA_TOTAL(RARRAY_AREF(scidxs, d)) != stot)
            rb_raise(rb_eArgError, "lens of scidxs are not uniform");
    }
    for (d=0; d<ndims; d++) {
        sci[d] = NA_PTR_TYPE(RARRAY_AREF(scidxs, d), double *);
    }

    dms =  ALLOCA_N(int, ndims);
    for (d=0; d<ndims; d++) {
        dms[d] = NUM2INT(RARRAY_AREF(dims, d));
        if (dms[d] < 0 || dms[d]>=rank)
            rb_raise(rb_eArgError, "values of dims is not within 0...rank(%d)",
                     rank);
    }
    cyc =  ALLOCA_N(int, ndims);
    for (d=0; d<ndims; d++) {
        cyc[d] = !NIL_P(cyclics) && RARRAY_AREF(cyclics, d) == Qtrue;
    }

    dimmap =  ALLOCA_N(int, rank); // e.g. [-1,0,-1,1,-1,-1,..] if dims==[1,3]
    for (d=0; d<rank; d++) {
        dimmap[d] = -1; // initialization (-1 indicates where dimension is left)
    }
    for (d=0; d<ndims; d++) {
        dimmap[dms[d]] = d; // position (0...ndims) of dimension to be sampled
    }
    
    rsh = ALLOCA_N(na_shape_t, rank); 
    for (d=0; d<rank; d++) {
        if (dimmap[d] < 0) {
            rsh[d] = ish[d];
        } else {
            rsh[d] = ssh[dimmap[d]];
        }
    }

    icsh = ALLOCA_N(na_shape_t, rank);  // cumulative shape [1, sh0, sh0*sh1,..]
    rcsh = ALLOCA_N(na_shape_t, rank);  // cumulative shape [1, sh0, sh0*sh1,..]
    scsh = ALLOCA_N(na_shape_t, ndims); // cumulative shape [1, sh0, sh0*sh1,..]
    icsh[0] = rcsh[0] = scsh[0] = 1;
    for (d=1; d<rank; d++) {
        icsh[d] = icsh[d-1]*ish[d-1];
        rcsh[d] = rcsh[d-1]*rsh[d-1];
    }
    for (d=1; d<ndims; d++) {
        scsh[d] = scsh[d-1]*ssh[d-1];
    }

    rval = na_make_object(NA_DFLOAT, rank, rsh, cNArray);
    rv = NA_PTR_TYPE(rval, double *);
    rtot = NA_TOTAL(rval);

    rmask = na_make_object(NA_BYTE, rank, rsh, cNArray);
    rmsk = NA_PTR_TYPE(rmask, u_int8_t *);

    rary = rb_ary_new();
    rb_ary_push(rary, rval);
    rb_ary_push(rary, rmask);

    nd2p = 1 << ndims;  // 2**ndims
    for (j=0; j<rtot; j++) {  // 1-dimensionalized loop for return values
        rmsk[j] = 1;   // (initialization) as all-valid
        si = 0; // (initialization) 1-dimensionalized index of scidx
        ii0 = 0; // (initialization) offset of ii for non-regrided dims
        for (d=0; d<rank; d++) {
            m = (j/rcsh[d]) % rsh[d]; // index along d-th dim of rv
            if (dimmap[d] >= 0) {
                si += m * scsh[dimmap[d]];   // for dim to regrid
            } else {
                ii0 += m * icsh[d];          // for dim not to regrid
            }
        }
        rv[j] = 0.0;  // (initialization) base of summation
        for (i=0; i<nd2p; i++) {  // loop for multi-D linear interpolation
            dii = 0; //(initialization) 1-dimensionalized iv index diff from ii0
            c = 1.0; //(initialization) fractional factor to sum up
            for (d=0; d<rank; d++) {
                dc = dimmap[d];  // if >0, dc-th sci is treated (dc: 0,1,..)
                if (dc >= 0) {
                    k = floor(sci[dc][si]); // index along d-th dim of iv
                    a = sci[dc][si] - k;  // subgrid fractional distance
                    kup = (i >> dc) % 2; // 0(1) to use the this(next) grid pt
                                         // (note: i>>d is equal to i/2**d)
                    k += kup;
                    if (k<0 || k>=ish[d]) {
                        if (cyc[d]) {
                            k = k % ish[d];
                            if (k<0) k += ish[d]; // for negative mod of C lang
                        } else {
                            rmsk[j] = 0;  // do not extrapolate
                            break;
                        }
                    }
                    dii += k * icsh[d];
                    c *= ( kup ? a : 1.0-a );
                }
            }
            if (rmsk[j] == 0) {
                break;  // have been set in the d loop --> break the i loop
            }
            ii = ii0 + dii;
            if ( imsk[ii] ) {
                rv[j] += c*iv[ii];
            } else {
                rmsk[j] = 0;
                break;
            }
        }
    }
    return rary;
}

void
init_gphys_interpo()
{
    static VALUE mNumRu;
    static VALUE cGPhys;

    // rb_require("narray");  // it does not work
    mNumRu = rb_define_module("NumRu");
    cGPhys = rb_define_class_under(mNumRu, "GPhys", rb_cObject);
    rb_define_private_method(cGPhys, "c_interpo_find_loc_1D", interpo_find_loc_1D, 4);
    rb_define_private_method(cGPhys, "c_interpo_find_loc_1D_MD", interpo_find_loc_1D_MD, 5);
    rb_define_private_method(cGPhys, "c_interpo_do", interpo_do, 5);
    rb_define_private_method(cGPhys, "c_interpo_missing", interpo_missing, 6);
    rb_define_private_method(cGPhys, "c_regrid2_w_idx", regrid2_w_idx, 5);

    // to make "find loc" methods available outside GPhys as class methods
    rb_define_singleton_method(cGPhys, "interpo_find_loc_1D", interpo_find_loc_1D, 4);
    rb_define_singleton_method(cGPhys, "interpo_find_loc_1D_MD", interpo_find_loc_1D_MD, 5);
    rb_define_singleton_method(cGPhys, "na_interpo_missing", interpo_missing, 6);
}
