/* Copyright 2007 Brendan Halpin brendan.halpin@ul.ie
Distribution is permitted under the terms of the GNU General Public Licence
*/
#include "stplugin.h"
#include
#include
#include
#include
#include
#define NSTATES 50
#define MAXLENGTH 100
double workspace[MAXLENGTH+1][MAXLENGTH+1][4];
double subsmat[NSTATES][NSTATES];
double indelcost;
int adjdur;
int show_workspace;
double exponent;
/* Extension of oma to treat indels (as deletions) and
substitutions differently where the action constitutes a
deletion of one of a sequence of observations in the same state.
An indel can be viewed as an insertion in one sequence or a
deletion in the other. Here we treat it as a deletion and look
at the relevant sequence for repeated observations.
A substitution is equivalently
(i) a deletion of s1[i] in s1 plus an insertion of s2[j] in s1,
(ii) a deletion of s2[j] in s2 with an insertion of s1[i]
(iii) a deletion in both s1 and s2 and
(iv) an insertion in both s1 and s2.
We can therefore think of substitutions as simultaneous
deletions, with a lower cost because of their simultaneity.
If there are no contiguous runs, then the total cost is
subs[s1[i],s2[j]], the conventional pair-specific substitution
cost, with a ceiling of 2*indelcost (i.e. if the substution cost
is high it will be cheaper to do a conventional double indel).
Where either s1[i] or s2[j] is part of a contiguous run, we can
reduce the substitution cost accordingly.
In this implementation the substitution cost is reduced by the
geometric mean of the reduction factor for s1[i] and s2[j]. That
is, if s1[i] is unique, and s2[j] is part of a run of two, the
divisor is sqrt(1*sqrt(2)); if s1[i] is one of three and s2[j] one
of two, it is sqrt(sqrt(3)*sqrt(2)).
*/
double omav2(int *s1, int *s2, int length1, int length2, int adjdur, int show_workspace) {
int i, j;
int rs1[MAXLENGTH];
int rs2[MAXLENGTH];
double A[length1+1];
double B[length2+1];
double C[length1+1][length2+1];
double D[length1+1][length2+1];
double denom;
char buf[80];
adj_rep(s1,rs1,length1);
adj_rep(s2,rs2,length2);
A[0]=0;
B[0]=0;
for (i=1;i<=length1;i++){
if (adjdur) {
denom = (float)rs1[i-1];
A[i]=indelcost/pow(denom,exponent);
} else {
A[i]=indelcost;
}
}
for (i=1;i<=length2;i++){
if (adjdur) {
denom = (float)rs2[i-1];
B[i]=indelcost/pow(denom,exponent);
} else {
B[i]=indelcost;
}
}
C[0][0]=0.0;
D[0][0]=0.0;
for (i = 1; i<= length1; i++ ) {
C[i][0]=0.0;
for (j = 1; j<= length2; j++ ) {
denom = (float)A[i];
/* Substitution cost is scaled according to the longer of the two runs Apr 21 2007 17:45:13 */
if (A[i]>B[j]) {
denom = (float)B[j];
}
C[i][j] = subsmat[s1[i-1]-1][s2[j-1]-1]*denom/indelcost; /* indelcost comes in because it is already in definition of A and B */
}
}
for (j = 1; j<= length2; j++ ) {
C[0][j]=0.0;
}
for (i = 1; i<= length1; i++ ) {
D[i][0] = D[i - 1][0] + A[i];
}
for (j = 1; j<= length2; j++ ) {
D[0][j] = D[0][j - 1] + B[j];
}
for (i = 1; i<= length1; i++ ) {
for (j = 1; j<= length2; j++ ) {
D[i][j] = D[i-1][j ] + A[i];
if (D[i][j]>D[i ][j-1] + B[j]) {
D[i][j] = D[i ][j-1] + B[j];
}
if (D[i][j]>D[i-1][j-1] + C[i][j]) {
D[i][j] = D[i-1][j-1] + C[i][j];
}
}
}
if (show_workspace==1) {
snprintf(buf, 80, "Seq 1: ");
SF_display(buf);
for (i=0;ilength2 ? length1 : length2));
}
int adj_rep(int *seq, int *rseq, int length) {
int start, count, i,j;
i = 0;
while (i= length) break;
}
for (j = start; j=1);
assert(s1[i-1]<=NSTATES);
}
for (i=1; i<=length2; i++) {
SF_vdata(i+2, second, &x2);
s2[i-1]=(int)x2;
assert(s2[i-1]>=1);
assert(s2[i-1]<=NSTATES);
}
return(omav2(s1,s2,length1,length2,adjdur,show_workspace));
}
/* Variables passed from Stata are: */
/* id length x1-xn */
STDLL stata_call(int argc, char *argv[])
{
ST_int nobs,nvars;
int i,j;
double distance;
ST_double z;
char buf[80] ;
FILE *outputfile;
char *outputfilename;
/* Fourth argument is whether to adjust for duration */
adjdur = atoi(argv[3]);
/* Optional fifth argument is whether to show workspace */
show_workspace = 0;
if (argc>3) {
show_workspace = atoi(argv[4]);
}
exponent = 0.5;
if (argc>4) {
exponent = (float)atof(argv[5]);
}
/* Read in the substitution matrix */
for (i=0;i