blob: c504a95efa87e10e2ed0394d47139d680d8369bd [file] [log] [blame]
c Code in this file based on Applied Statistics algorithms
c (C) Royal Statistical Society 1979
c
C a minimal modification of AS136 to use double precision
C all variables are now declared.
C B.D. Ripley 1998/06/17
C Declaration re-ordering to satisfy "f77 -ansi", M.Maechler 2001/04/12
C
c ~= R's kmeans(x=A, centers=C, iter.max=ITER, algorithm = "Hartigan-Wong")
C
SUBROUTINE KMNS(A, M, N, C, K, IC1, IC2, NC, AN1, AN2, NCP, D,
* ITRAN, LIVE, ITER, WSS, IFAULT)
C
C ALGORITHM AS 136 APPL. STATIST. (1979) VOL.28, NO.1
C
C Divide M points in N-dimensional space into K clusters so that
C the within cluster sum of squares is minimized.
C
INTEGER M,N,K,ITER,IFAULT
INTEGER IC1(M), IC2(M), NC(K), NCP(K), ITRAN(K+1), LIVE(K)
DOUBLE PRECISION A(M,N), D(M), C(K,N), AN1(K), AN2(K), WSS(K)
c ------ ------
c data x[,] centers[,]
DOUBLE PRECISION DT(2), ZERO, ONE
INTEGER I,IL,J,L,INDX,IJ,II, iTrace, iMaxQtr
DOUBLE PRECISION BIG, DA, TEMP, DB, DC,AA
C
C Define BIG to be a very large positive number
C
DATA BIG /1.E30/, ZERO /0.0/, ONE /1.0/
C
iTrace = IFAULT
iMaxQtr = ITRAN(1)
IFAULT = 3
IF (K .LE. 1 .OR. K .GE. M) RETURN
IFAULT = 0
C
C For each point I, find its two closest centres, IC1(I) and
C IC2(I). Assign it to IC1(I).
C
DO 60 I = 1, M
IC1(I) = 1
IC2(I) = 2
DO IL = 1, 2
DT(IL) = ZERO
DO J = 1, N
DA = A(I,J) - C(IL,J)
DT(IL) = DT(IL) + DA*DA
end DO
end DO ! IL
IF (DT(1) .GT. DT(2)) THEN
IC1(I) = 2
IC2(I) = 1
TEMP = DT(1)
DT(1) = DT(2)
DT(2) = TEMP
END IF
DO 50 L = 3, K
DB = ZERO
DO J = 1, N
DC = A(I,J) - C(L,J)
DB = DB + DC*DC
IF (DB .GE. DT(2)) GO TO 50
end DO
IF (DB .ge. DT(1)) then
DT(2) = DB
IC2(I) = L
else
DT(2) = DT(1)
IC2(I) = IC1(I)
DT(1) = DB
IC1(I) = L
end IF
50 CONTINUE
60 CONTINUE
C
C Update cluster centres to be the average of points contained
C within them.
C NC(L) := #{units in cluster L}, L = 1..K
DO L = 1, K
NC(L) = 0
DO J = 1, N
C(L,J) = ZERO
end DO
end DO
DO I = 1, M
L = IC1(I)
NC(L) = NC(L) + 1
DO J = 1, N
C(L,J) = C(L,J) + A(I,J)
end DO
end DO
C
C Check to see if there is any empty cluster at this stage
C
DO L = 1, K
IF (NC(L) .EQ. 0) THEN
IFAULT = 1
RETURN
END IF
AA = NC(L)
DO J = 1, N
C(L,J) = C(L,J) / AA
end DO
C
C Initialize AN1, AN2, ITRAN & NCP
C AN1(L) = NC(L) / (NC(L) - 1)
C AN2(L) = NC(L) / (NC(L) + 1)
C ITRAN(L) = 1 if cluster L is updated in the quick-transfer stage,
C = 0 otherwise
C In the optimal-transfer stage, NCP(L) stores the step at which
C cluster L is last updated.
C In the quick-transfer stage, NCP(L) stores the step at which
C cluster L is last updated plus M.
C
AN2(L) = AA / (AA + ONE)
AN1(L) = BIG
IF (AA .GT. ONE) AN1(L) = AA / (AA - ONE)
ITRAN(L) = 1
NCP(L) = -1
end DO
INDX = 0
DO IJ = 1, ITER
C
C OPtimal-TRAnsfer stage: there is only one pass through the data.
C Each point is re-allocated, if necessary, to the cluster that will
C induce the maximum reduction in within-cluster sum of squares.
C
CALL OPTRA(A, M, N, C, K, IC1, IC2, NC, AN1, AN2, NCP, D,
* ITRAN, LIVE, INDX)
if(iTrace .gt. 0) call kmns1(K, IJ, INDX)
C
C Stop if no transfer took place in the last M optimal transfer steps.
IF (INDX .EQ. M) GO TO 150
C
C Quick-TRANSfer stage: Each point is tested in turn to see if it should
C be re-allocated to the cluster to which it is most likely to be
C transferred, IC2(I), from its present cluster, IC1(I).
C Loop through the data until no further change is to take place.
C
CALL QTRAN(A, M, N, C, K, IC1, IC2, NC, AN1, AN2, NCP, D,
* ITRAN, INDX, iTrace, iMaxQtr)
C
if(iMaxQtr .lt. 0) then
IFAULT = 4
GO TO 150
end if
C
C If there are only two clusters, there is no need to re-enter the
C optimal transfer stage.
C
IF (K .EQ. 2) GO TO 150
C
C NCP has to be set to 0 before entering OPTRA.
C
DO L = 1, K
NCP(L) = 0
end DO
end DO ! iter --------------------------------------
C
C Since the specified number of iterations has been exceeded, set
C IFAULT = 2. This may indicate unforeseen looping.
C
IFAULT = 2
150 continue
ITER = IJ
C
C Compute within-cluster sum of squares for each cluster.
C
do L = 1, K
WSS(L) = ZERO
do J = 1, N
C(L,J) = ZERO
end do
end do
do I = 1, M
II = IC1(I)
do J = 1, N
C(II,J) = C(II,J) + A(I,J)
end do
end do
do J = 1, N
do L = 1, K
C(L,J) = C(L,J) / DBLE(NC(L))
end do
do I = 1, M
II = IC1(I)
DA = A(I,J) - C(II,J)
WSS(II) = WSS(II) + DA*DA
end do
end do
C
RETURN
END
C
C
SUBROUTINE OPTRA(A, M, N, C, K, IC1, IC2, NC, AN1, AN2, NCP, D,
* ITRAN, LIVE, INDX)
C
C ALGORITHM AS 136.1 APPL. STATIST. (1979) VOL.28, NO.1
C
C This is the OPtimal TRAnsfer stage.
C ----------------------
C Each point is re-allocated, if necessary, to the cluster that
C will induce a maximum reduction in the within-cluster sum of
C squares.
C
INTEGER M,N,K,INDX
INTEGER IC1(M), IC2(M), NC(K), NCP(K), ITRAN(K), LIVE(K)
DOUBLE PRECISION A(M,N), D(M), C(K,N), AN1(K), AN2(K)
INTEGER L,I,L1,L2,LL,J
DOUBLE PRECISION ZERO, ONE
DOUBLE PRECISION BIG,DE,DF,DA,DB,R2,RR,DC,DD,AL1,ALW,AL2,ALT
C
DATA BIG /1.0E30/, ! BIG := a very large positive number
+ ZERO /0.0/, ONE/1.0/
C
C If cluster L is updated in the last quick-transfer stage, it
C belongs to the live set throughout this stage. Otherwise, at
C each step, it is not in the live set if it has not been updated
C in the last M optimal transfer steps.
C
do L = 1, K
IF (ITRAN(L) .EQ. 1) LIVE(L) = M + 1
end do
C ---------------------- Loop over each point -------------------
DO I = 1, M
INDX = INDX + 1
L1 = IC1(I)
L2 = IC2(I)
LL = L2
C
C If point I is the only member of cluster L1, no transfer.
C
IF (NC(L1) .EQ. 1) GO TO 90
C
C If L1 has not yet been updated in this stage, no need to
C re-compute D(I).
C
IF (NCP(L1) .ne. 0) then
DE = ZERO
do J = 1, N
DF = A(I,J) - C(L1,J)
DE = DE + DF*DF
end do
D(I) = DE * AN1(L1)
END IF
C
C Find the cluster with minimum R2.
C
DA = ZERO
do J = 1, N
DB = A(I,J) - C(L2,J)
DA = DA + DB*DB
end do
R2 = DA * AN2(L2)
DO 60 L = 1, K
C
C If I >= LIVE(L1), then L1 is not in the live set. If this is
C true, we only need to consider clusters that are in the live set
C for possible transfer of point I. Otherwise, we need to consider
C all possible clusters.
C
IF (I .GE. LIVE(L1) .AND. I .GE. LIVE(L) .OR. L .EQ. L1 .OR.
* L .EQ. LL) GO TO 60
RR = R2 / AN2(L)
DC = ZERO
do J = 1, N
DD = A(I,J) - C(L,J)
DC = DC + DD*DD
IF (DC .GE. RR) GO TO 60
end do
R2 = DC * AN2(L)
L2 = L
60 CONTINUE
IF (R2 .ge. D(I)) then
C If no transfer is necessary, L2 is the new IC2(I).
IC2(I) = L2
ELSE
C
C Update cluster centres, LIVE, NCP, AN1 & AN2 for clusters L1 and
C L2, and update IC1(I) & IC2(I).
INDX = 0
LIVE(L1) = M + I
LIVE(L2) = M + I
NCP(L1) = I
NCP(L2) = I
AL1 = NC(L1)
ALW = AL1 - ONE
AL2 = NC(L2)
ALT = AL2 + ONE
do J = 1, N
C(L1,J) = (C(L1,J) * AL1 - A(I,J)) / ALW
C(L2,J) = (C(L2,J) * AL2 + A(I,J)) / ALT
end do
NC(L1) = NC(L1) - 1
NC(L2) = NC(L2) + 1
AN2(L1) = ALW / AL1
AN1(L1) = BIG
IF (ALW .GT. ONE) AN1(L1) = ALW / (ALW - ONE)
AN1(L2) = ALT / AL2
AN2(L2) = ALT / (ALT + ONE)
IC1(I) = L2
IC2(I) = L1
END IF
90 CONTINUE
IF (INDX .EQ. M) RETURN
end do
C ---------------------- each point -------------------
do L = 1, K
ITRAN(L) = 0 !before entering QTRAN.
c LIVE(L) has to be decreased by M before re-entering OPTRA
LIVE(L) = LIVE(L) - M
end do
C
RETURN
END
C
C
SUBROUTINE QTRAN(A, M, N, C, K, IC1, IC2, NC, AN1, AN2, NCP, D,
* ITRAN, INDX, iTrace, iMaxQtr)
C
C ALGORITHM AS 136.2 APPL. STATIST. (1979) VOL.28, NO.1
C
C This is the Quick TRANsfer stage.
c --------------------
C IC1(I) is the cluster which point I belongs to.
C IC2(I) is the cluster which point I is most likely to be
C transferred to.
C For each point I, IC1(I) & IC2(I) are switched, if necessary, to
C reduce within-cluster sum of squares. The cluster centres are
C updated after each step.
C
INTEGER M,N,K,INDX, iTrace
INTEGER IC1(M), IC2(M), NC(K), NCP(K), ITRAN(K)
DOUBLE PRECISION A(M,N), D(M), C(K,N), AN1(K), AN2(K)
DOUBLE PRECISION ZERO, ONE
INTEGER ICOUN,ISTEP,I,L1,L2,J
DOUBLE PRECISION BIG,DA,DB,DD,AL1,ALW,AL2,ALT,R2,DE
external kmnsQpr
C
C Define BIG to be a very large positive number
C
DATA BIG /1.0E30/, ZERO /0.0/, ONE /1.0/
C
C In the quick transfer stage, NCP(L)
C is equal to the step at which cluster L is last updated plus M.
C
ICOUN = 0
ISTEP = 0
c Repeat {
10 continue
DO I = 1, M
if(iTrace .gt. 0 .and. ISTEP .ge. 1 .and. I .eq. 1) ! only from second "round" on
+ call kmnsQpr(ISTEP, ICOUN, NCP, K, iTrace)
ICOUN = ICOUN + 1
ISTEP = ISTEP + 1
IF(ISTEP .ge. iMaxQtr) THEN
iMaxQtr = -1
RETURN
ENDIF
L1 = IC1(I)
L2 = IC2(I)
C
C If point I is the only member of cluster L1, no transfer.
C
IF (NC(L1) .EQ. 1) GO TO 60
C
C If ISTEP > NCP(L1), no need to re-compute distance from point I to
C cluster L1. Note that if cluster L1 is last updated exactly M
C steps ago, we still need to compute the distance from point I to
C cluster L1.
C
IF (ISTEP .le. NCP(L1)) then
DA = ZERO
DO J = 1, N
DB = A(I,J) - C(L1,J)
DA = DA + DB*DB
end DO
D(I) = DA * AN1(L1)
end IF
C
C If ISTEP >= both NCP(L1) & NCP(L2) there will be no transfer of
C point I at this step.
C
IF (ISTEP .lt. NCP(L1) .or. ISTEP .lt. NCP(L2)) then
R2 = D(I) / AN2(L2)
DD = ZERO
DO J = 1, N
DE = A(I,J) - C(L2,J)
DD = DD + DE*DE
IF (DD .GE. R2) GO TO 60
end DO
C
C Update cluster centres, NCP, NC, ITRAN, AN1 & AN2 for clusters
C L1 & L2. Also update IC1(I) & IC2(I). Note that if any
C updating occurs in this stage, INDX is set back to 0.
C
ICOUN = 0
INDX = 0
ITRAN(L1) = 1
ITRAN(L2) = 1
NCP(L1) = ISTEP + M
NCP(L2) = ISTEP + M
AL1 = NC(L1)
ALW = AL1 - ONE
AL2 = NC(L2)
ALT = AL2 + ONE
DO J = 1, N
C(L1,J) = (C(L1,J) * AL1 - A(I,J)) / ALW
C(L2,J) = (C(L2,J) * AL2 + A(I,J)) / ALT
end DO
NC(L1) = NC(L1) - 1
NC(L2) = NC(L2) + 1
AN2(L1) = ALW / AL1
AN1(L1) = BIG
IF (ALW .GT. ONE) AN1(L1) = ALW / (ALW - ONE)
AN1(L2) = ALT / AL2
AN2(L2) = ALT / (ALT + ONE)
IC1(I) = L2
IC2(I) = L1
end if
C
C If no re-allocation took place in the last M steps, return.
C
60 IF (ICOUN .EQ. M) RETURN
end do
call rchkusr() ! allow user interrupt
GO TO 10
c --------
END