             AREA |C$$code|,CODE,READONLY

             GET "Hdr.Common"

; The square root function is trivial, because VFP calculates
; square roots in hardware using the vsqrt instruction.

             EXPORT vfp_sqrt
; vfp_sqrt: replace the value in d0 with its square root.
; p1arg and pres are macros from Hdr.Common (not visible in this file);
; presumably p1arg fetches the argument into d0 and pres hands the
; result in d0 back to the caller - confirm against Hdr.Common.
vfp_sqrt     p1arg
             vsqrt.F64 d0,d0         ; d0 = sqrt(d0), done by the VFP hardware
             pres
             mov pc,r14              ; return to the caller

; This calculates the hypotenuse of a triangle with sides of
; length x and y. However, we can't simply evaluate sqrt(x^2 + y^2)
; term by term because of possible rounding errors.

             EXPORT vfp_hypot
; vfp_hypot: d0 = sqrt(x^2 + y^2) with x in d0 and y in d1.
;
; x^2 + y^2 is built from split operands so that the large partial
; products carry little or no rounding error.  A "high part" below is the
; double with its low 32 mantissa bits cleared (at most 21 significant
; bits), so the product of two high parts is exact.
;
;   x >  2y:  x1 = high part of x,  x2 = x - x1
;             x^2 + y^2 = x1*x1 + (y*y + x2*(x + x1))
;   x <= 2y:  w  = x - y
;             t1 = high part of 2x, t2 = 2x - t1
;             y1 = high part of y,  y2 = y - y1
;             x^2 + y^2 = t1*y1 + (w*w + (t1*y2 + t2*y))   [= 2xy + (x-y)^2]
;
; p1arg, p2arg and pres are macros from Hdr.Common, not visible here.
; NOTE(review): this matches the classic split used for x >= y >= 0; the
; code itself does not take absolute values or order the operands, and
; does not rescale very large or very small inputs - confirm callers.
; This code writes r1, r2 and d0-d4.
vfp_hypot    p2arg                   ; d1=y
             p1arg                   ; d0=x
             vsub.F64 d2,d0,d1       ; d2 = w = x-y
             vcmp.F64 d2,d1
             vmrs apsr_nzcv,fpscr    ; copy the VFP compare result to the ARM flags
             ble %FT10               ; x-y <= y, i.e. x <= 2y
             vmov.F64 r1,r2,d0       ; case where x>2y (r1=low word, r2=high word)
             mov r1,#0               ; discard the low 32 bits
             vmov.F64 d2,r1,r2       ; d2 = x1 = high bits of x
             vmul.F64 d3,d2,d2       ; d3 = x1*x1 (exact)
             vmul.F64 d1,d1,d1       ; y*y
             vsub.F64 d4,d0,d2       ; d4 = x2 = x-x1
             vadd.F64 d0,d0,d2       ; x+x1
             vmul.F64 d0,d0,d4       ; x2*(x+x1)
             vadd.F64 d0,d1,d0       ; y*y + x2*(x+x1)
             vadd.F64 d0,d3,d0       ; x1*x1 + y*y + x2*(x+x1)
             vsqrt.F64 d0,d0
             pres
             mov pc,r14
10           vadd.F64 d0,d0,d0       ; case where x<=2y: d0 = 2x
             vmov.F64 r1,r2,d0
             mov r1,#0
             vmov.F64 d3,r1,r2       ; d3 = t1 = high bits of 2x
             vsub.F64 d0,d0,d3       ; d0 = t2 = 2x-t1
             vmul.F64 d0,d0,d1       ; t2*y
             vmov.F64 r1,r2,d1
             mov r1,#0
             vmov.F64 d4,r1,r2       ; d4 = y1 = high bits of y
             vsub.F64 d1,d1,d4       ; d1 = y2 = y-y1
             vmul.F64 d1,d3,d1       ; t1*y2
             vadd.F64 d0,d1,d0       ; t1*y2 + t2*y
             vmul.F64 d2,d2,d2       ; w*w
             vadd.F64 d0,d2,d0       ; w*w + t1*y2 + t2*y
             vmul.F64 d1,d3,d4       ; t1*y1 (exact)
             vadd.F64 d0,d1,d0       ; t1*y1 + w*w + t1*y2 + t2*y
             vsqrt.F64 d0,d0
             pres
             mov pc,r14

; The cube root is a lot more complicated than the square root.
; The basic principle is to find an approximate value from a
; polynomial approximation, and refine it using Newton's method.

             EXPORT vfp_cbrt
; vfp_cbrt: replace the value in d0 with its cube root.
;
; Method: getfexp splits the argument into a fraction f (d1) and an
; exponent e (r1).  With e = 3q + r the first estimate is
;     poly(f) * 2^(r/3) * 2^q
; which is then polished by three Newton steps on x^3 - c = 0.
;
; A zero argument is returned unchanged: with c = 0 a Newton step only
; maps x to 2x/3 (or produces 0/0), so the iteration can never yield 0.
;
; p1arg, getfexp and pres are macros from Hdr.Common, not visible here.
; NOTE(review): the divide-by-3 below is unsigned, so it presumably relies
; on getfexp delivering a non-negative exponent in r1 - confirm there.
; NOTE(review): handling of negative, infinite and NaN arguments depends
; on getfexp as well and has not been checked from this file.
; This code writes r1-r3 and d0-d4, d6.
vfp_cbrt     p1arg
             vmov.F64 d2,d0          ; d2 = value returned for a zero argument
             vcmp.F64 d0,#0
             vmrs apsr_nzcv,fpscr
             beq %FT40               ; cbrt(+/-0) = +/-0
             getfexp r1,d1

; Divide the exponent by 3 with a shift-and-subtract loop.  r3 starts at
; 3<<k, k = max(30-clz(r1),0), the largest multiple that can be needed.
             clz r2,r1
             rsbs r2,r2,#30
             movmi r2,#0
             mov r3,#3
             mov r3,r3,lsl r2
             mov r2,#0               ; quotient
10           cmp r1,r3               ; C set if r3 fits into the remainder
             subcs r1,r1,r3
             adc r2,r2,r2            ; shift the new quotient bit (C) into r2
             mov r3,r3,lsr #1
             cmp r3,#3
             bhs %BT10

; at this point r2 holds the exponent divided by 3 and r1 the remainder
; d0 the original argument and d1 the fraction between 0.5 and 1

             add r2,r2,#1024         ; add the exponent bias of 1023
             sub r2,r2,#1            ; (1023 is not a valid immediate)
             mov r2,r2,lsl #20       ; move into the exponent field of the high word
             mov r3,#0               ; low word (mantissa) = 0
             vmov.F64 d2,r3,r2       ; 2^(e/3)
             adr r2,cbscale
             add r2,r2,r1,lsl #3     ; 8 bytes per table entry
             vldr.64 d3,[r2]

; now d2 contains 2 raised to the integer part of e/3 and d3 the scale
; term 2^(r/3) selected on the basis of the remainder r

; Evaluate the polynomial in f by Horner's rule, highest power first.
; VLDR has no post-indexed form, so VLDMIA with writeback is used to load
; one coefficient and advance the pointer.
             adr r2,cbrcoeff
             add r3,r2,#40           ; end of table: 5 coefficients * 8 bytes
             vldmia r2!,{d6}         ; d6=sum
20           vldmia r2!,{d4}
             vmla.F64 d4,d6,d1       ; sum=sum*y + term
             vmov.F64 d6,d4
             cmp r2,r3
             blo %BT20               ; addresses are unsigned
             vmul.F64 d3,d6,d3
             vmul.F64 d2,d3,d2       ; d2 is an approximate cube root

; d2 contains an approximation to the cube root accurate to ~5 s.f.
; so the final stage is to refine the root using Newton's method:
;     x = x - (x^3 - c) / (3x^2)

             vmov.F64 d4,#3.0
             mov r2,#3               ; number of Newton steps
30           vmul.F64 d1,d2,d2       ; x^2
             vmul.F64 d3,d1,d2       ; x^3
             vmul.F64 d1,d1,d4       ; 3x^2
             vsub.F64 d3,d3,d0       ; x^3 - c
             vdiv.F64 d1,d3,d1       ; (x^3 - c) / 3x^2
             vsub.F64 d2,d2,d1       ; new x
             subs r2,r2,#1
             bne %BT30
40           vmov.F64 d0,d2          ; result (also the zero-argument exit)
             pres
             mov pc,r14

; Scale factors 2^(r/3) for the exponent remainder r = 0, 1, 2
cbscale      DCFD 1.0
             DCFD 1.2599210498948731647672
             DCFD 1.5874010519681994747517

; Coefficients of the quartic used for the first estimate of the cube
; root of the fraction, highest power first
cbrcoeff     DCFD -1.3466110473359520655053E-1
             DCFD 5.4664601366395524503440E-1
             DCFD -9.5438224771509446525043E-1
             DCFD 1.1399983354717293273738E0
             DCFD 4.0238979564544752126924E-1

             END
