<?xml version="1.0" encoding="utf-8" standalone="no"?>
<!DOCTYPE document PUBLIC "-//CNX//DTD CNXML 0.5 plus MathML//EN" "http://cnx.rice.edu/cnxml/0.5/DTD/cnxml_mathml.dtd">
<document xmlns="http://cnx.rice.edu/cnxml" xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="Module.2004-05-18.4001">
  <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Efficient FFT Algorithm and Programming Tricks</name>
  <metadata xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
  <md:version xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">1.6</md:version>
  <md:created xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">2004/05/18 10:40:02 GMT-5</md:created>
  <md:revised xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">2007/02/24 12:15:20.331 US/Central</md:revised>
  <md:authorlist xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
      <md:author xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="dljones">
      <md:firstname xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Douglas</md:firstname>
      <md:othername xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">L.</md:othername>
      <md:surname xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Jones</md:surname>
      <md:email xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">dl-jones@uiuc.edu</md:email>
    </md:author>
  </md:authorlist>

  <md:maintainerlist xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
    <md:maintainer xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="dljones">
      <md:firstname xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Douglas</md:firstname>
      <md:othername xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">L.</md:othername>
      <md:surname xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Jones</md:surname>
      <md:email xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">dl-jones@uiuc.edu</md:email>
    </md:maintainer>
    <md:maintainer xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="harika">
      <md:firstname xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Harika</md:firstname>
      
      <md:surname xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Basana</md:surname>
      <md:email xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">ilsai@rice.edu</md:email>
    </md:maintainer>
    <md:maintainer xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="kclarks">
      <md:firstname xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Kyle</md:firstname>
      <md:othername xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Evan</md:othername>
      <md:surname xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Clarkson</md:surname>
      <md:email xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">kclarks@rice.edu</md:email>
    </md:maintainer>
  </md:maintainerlist>
  
  <md:keywordlist xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
    <md:keyword xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">assembly language</md:keyword>
    <md:keyword xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">bit reverse</md:keyword>
    <md:keyword xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">compiler</md:keyword>
    <md:keyword xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">DFT</md:keyword>
    <md:keyword xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">FFT</md:keyword>
    <md:keyword xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">real FFT</md:keyword>
    <md:keyword xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">table lookup</md:keyword>
  </md:keywordlist>

  <md:abstract xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Many tricks and techniques have been developed to speed up the computation of FFTs.
Significant reductions in computation time result from table lookup of twiddle factors, compiler-friendly or assembly-language programming, special hardware, and FFT algorithms for real-valued data.
Higher-radix algorithms, fast bit-reversal, and special butterflies yield more modest but worthwhile savings.</md:abstract>
</metadata>

  <content xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
  <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="element-328">The use of <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12026">FFT algorithms</cnxn> such as the
<cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12016">radix-2 decimation-in-time</cnxn> or
<cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12018">decimation-in-frequency</cnxn> methods
result in tremendous savings in computations when computing the
<cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12019">discrete Fourier transform</cnxn>.
While most of the speed-up of FFTs comes from this,
careful implementation can provide additional savings ranging
from a few percent to several-fold increases in program speed.</para><section xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
  <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Precompute twiddle factors</name>
      <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="p1">The <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12016">twiddle factor</cnxn>, or
<m:math>
  <m:apply>
    <m:eq/>
      <m:ci>
	<m:msubsup>
	  <m:mi>W</m:mi>
	  <m:mi>N</m:mi>
	  <m:mi>k</m:mi>
	</m:msubsup>
      </m:ci>
      <m:apply>
	<m:exp/>
	  <m:apply>
	    <m:minus/>
              <m:apply>
	        <m:times/>
		  <m:imaginaryi/>
		    <m:apply>
		      <m:divide/>
		       <m:apply>
			<m:times/>
			<m:cn>2</m:cn>
			<m:pi/>
			<m:ci>k</m:ci>
		       </m:apply>
		     <m:ci>N</m:ci>
		    </m:apply>
		  </m:apply>
		</m:apply>
	      </m:apply>
        </m:apply>
	</m:math>,
 terms
        that multiply the intermediate data in the <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12026">FFT algorithms</cnxn> consist of cosines and sines
        that each take the equivalent of several multiplies to compute.
        However, at most <m:math><m:ci>N</m:ci></m:math> unique
        twiddle factors can appear in any FFT or DFT algorithm.
        (For example, in the <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12016">radix-2 decimation-in-time FFT</cnxn>,
        only
        <m:math>
          <m:apply>
          <m:divide/>
            <m:ci>N</m:ci>
            <m:ci>2</m:ci>
          </m:apply>
        </m:math>
        twiddle factors
	<m:math>
	  <m:apply>
	    <m:forall/>
	    <m:bvar>
	      <m:ci>k</m:ci>
	    </m:bvar>
	    <m:condition>
	      <m:apply>
		<m:eq/>
		<m:ci>k</m:ci>
		<m:set>
		  <m:cn>0</m:cn>
		  <m:cn>1</m:cn>
		  <m:cn>2</m:cn>
		  <m:ci>…</m:ci>
		  <m:apply>
		    <m:minus/>
		    <m:apply>
		      <m:divide/>
		      <m:ci>N</m:ci>
		      <m:cn>2</m:cn>
		    </m:apply>
		    <m:cn>1</m:cn>
		  </m:apply>
		</m:set>
	      </m:apply>
	    </m:condition>
	    <m:apply>
	      <m:power/>
	      <m:ci><m:msub>
		  <m:mi>W</m:mi>
		  <m:mi>N</m:mi>
		</m:msub></m:ci>
	      <m:ci>k</m:ci>
	    </m:apply>
	  </m:apply>
	</m:math>
        are used.)
        These twiddle factors can be precomputed once and stored
        in an array in computer memory, and accessed in the FFT
        algorithm by <term xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">table lookup</term>.
        This simple technique yields very substantial savings and
        is almost always used in practice.</para>
    </section>
    <section xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
    <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Compiler-friendly programming</name>
      <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="p3">On most computers, only some of the total computation time
        of an FFT is spent performing the FFT butterfly computations;
        determining indices, loading and storing data, computing
        loop parameters and other operations consume the majority
        of cycles.
        Careful programming that allows the compiler to generate
        efficient code can make a several-fold improvement in the
        run-time of an FFT.
        The best choice of radix in terms of program speed may depend more on characteristics
        of the hardware (such as the number of CPU registers) or
        compiler than on the exact number of computations.
        Very often the manufacturer's library codes are carefully
        crafted by experts who know intimately both the hardware
        and compiler architecture and how to get the most performance
        out of them, so use of well-written FFT libraries is
        generally recommended.
        Certain freely available programs and libraries are also
        very good.
        Perhaps the best current general-purpose library is the
        <link xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="http://www.fftw.org">FFTW</link> package;
        information can be found at
        <link xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="http://www.fftw.org">http://www.fftw.org</link>.
        A paper by <cite xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="#Frigo">Frigo and Johnson</cite> describes many of the key issues in developing
        compiler-friendly code.
       </para>
       </section>
       <section xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
<name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Program in assembly language</name>
<para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="element-551">While compilers continue to improve, FFT programs written directly
in the assembly language of a specific machine are often
several times faster than the best compiled code.
This is particularly true for DSP microprocessors, which have
special instructions for accelerating FFTs that compilers don't use.
(I have myself seen differences of up to 26 to 1 in favor of assembly!)
Very often, FFTs in the manufacturer's or high-performance third-party libraries
are hand-coded in assembly.
For DSP microprocessors, the codes developed by
<cite xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="#Meyer">Meyer, Schuessler, and Schwarz</cite>
are perhaps the best ever developed;
while the particular processors are now obsolete, the techniques
remain equally relevant today.
Most DSP processors provide special instructions and a
hardware design favoring the radix-2 decimation-in-time algorithm,
which is thus generally fastest on these machines.</para>
    </section>
    <section xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
    <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Special hardware</name>
      <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="p4">Some processors have special hardware accelerators or co-processors
      specifically designed to accelerate FFT computations.
      For example, <link xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="http://www.amis.com">AMI Semiconductor's</link> <link xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="http://www.amis.com/products/dsp/toccata_plus.html">Toccata</link> ultra-low-power DSP microprocessor family,
      which is widely used in digital hearing aids, have on-chip FFT accelerators; it is always faster and more power-efficient to use such accelerators and whatever radix they prefer.</para><para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="element-535">In a surprising number of applications, almost all of the computations
     are FFTs.
     A number of special-purpose chips are designed to specifically
     compute FFTs, and are used in specialized high-performance
     applications such as radar systems.
     Other systems, such as <link xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="http://en.wikipedia.org/wiki/OFDM">OFDM</link>-based
     communications receivers, have special FFT hardware built
     into the digital receiver circuit.
     Such hardware can run many times faster, with much less power
     consumption, than FFT programs on general-purpose processors.</para>
    </section>
    <section xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
    <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Effective memory management</name>
      <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="p7">Cache misses or excessive data movement between registers and memory
        can greatly slow down an FFT computation.
        Efficient programs such as the <link xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="http://www.fftw.org">FFTW package</link>
        are carefully designed to minimize these inefficiences.
        <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12016">In-place algorithms</cnxn> reuse the data
        memory throughout the transform, which can reduce cache misses for
        longer lengths.
      </para>
    </section>
    <section xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
    <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Real-valued FFTs</name>
      <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="p9">FFTs of real-valued signals require only half as many computations as with complex-valued data.
    There are several methods for reducing the computation,
    which are described in more detail in
        <cite xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="#Sorensen2">Sorensen et al.</cite>
    <list xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="list6" type="enumerated"><item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Use <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12019">DFT symmetry properties</cnxn> to do two real-valued DFTs at once with one FFT program</item>

      <item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Perform one stage of the <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12016">radix-2 decimation-in-time</cnxn> decomposition
        and compute the two length-<m:math>
          <m:apply>
          <m:divide/>
            <m:ci>N</m:ci>
            <m:cn>2</m:cn>
          </m:apply>
         </m:math>
         DFTs using the above approach.
      </item>
      <item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Use a direct  real-valued FFT algorithm; see <cite xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="#Sorensen2">H.V. Sorensen
      et.al.</cite>
      </item>

    </list>
      </para>
    </section>
    <section xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
    <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Special cases</name>
      <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="p10">Occasionally only certain DFT frequencies are needed,
        the input signal values are mostly zero, the signal
        is real-valued (as discussed above), or other special
        conditions exist for which faster algorithms can be
        developed.
        <cite xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="#Sorensen3">Sorensen and Burrus</cite>
        describe slightly faster algorithms for <link xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="http://www.fftw.org/pruned.html">pruned</link>
        or <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12032" target="zeropad">zero-padded</cnxn> data.
        <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12024">Goertzel's algorithm</cnxn> is useful
        when only a few DFT outputs are needed.
        The <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12029">running FFT</cnxn> can be faster
        when DFTs of highly overlapped blocks of data are needed,
        as in a <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m10570">spectrogram</cnxn>.
</para>
    </section>
    <section xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
    <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Higher-radix algorithms</name>
      <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="p12">Higher-radix algorithms, such as the <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12027">radix-4</cnxn>, radix-8, or <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12031">split-radix</cnxn> FFTs,
        require fewer computations and can produce modest but worthwhile savings.
        Even the <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12031">split-radix FFT</cnxn>
        reduces the multiplications by only 33% and the additions
        by a much lesser amount relative to the <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12016">radix-2 FFTs</cnxn>;
        significant improvements in program speed are often
        due to implicit <link xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="http://en.wikipedia.org/wiki/Loop_unrolling">loop-unrolling</link> or other compiler benefits
        than from the computational reduction itself!
      </para>
    </section>
    <section xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
    <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Fast bit-reversal</name>
      <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="p8"><cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12016">Bit-reversing</cnxn> the input
        or output data can consume several percent of the total
        run-time of an FFT program.
        Several fast bit-reversal algorithms have been developed
        that can reduce this to two percent or less, including
        the method published by <cite xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" src="#Evans">D.M.W. Evans</cite>.
      </para>
    </section>
    <section xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
    <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Trade additions for multiplications</name>
      <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="p2">When FFTs first became widely used, hardware multipliers
        were relatively rare on digital computers, and multiplications
        generally required many more cycles than additions.
        Methods to reduce multiplications, even at the expense
        of a substantial increase in additions, were often beneficial.
        The <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12033">prime factor algorithms</cnxn>
        and the <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12023">Winograd Fourier transform algorithms</cnxn>,
        which required fewer multiplies and considerably more additions
        than the <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12059">power-of-two-length algorithms</cnxn>,
        were developed during this period.
        Current processors generally have high-speed pipelined
        hardware multipliers, so trading multiplies for additions
        is often no longer beneficial.
        In particular, most machines now support single-cycle
        multiply-accumulate (MAC) operations, so balancing the
        number of multiplies and adds and combining them into
        single-cycle MACs generally results in the fastest code.
        Thus, the prime-factor and Winograd FFTs are rarely used
        today unless the application requires FFTs of a specific length.
      </para>
      <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="p33">It is possible to implement a complex multiply with
	3 real multiplies and 5 real adds rather than the usual
        4 real multiplies and 2 real adds:
	<m:math display="block">
	  <m:apply>
	    <m:eq/>
	    <m:apply>
	      <m:times/>
	      <m:apply>
		<m:plus/>
		<m:ci>C</m:ci>
		<m:apply>
		  <m:times/>
		  <m:imaginaryi/>
		  <m:ci>S</m:ci>
		</m:apply>
	      </m:apply>
	      <m:apply>
		<m:plus/>
		<m:ci>X</m:ci>
		<m:apply>
		  <m:times/>
		  <m:imaginaryi/>
		  <m:ci>Y</m:ci>
		</m:apply>
	      </m:apply>
	    </m:apply>
	    <m:apply>
	      <m:plus/>
	      <m:apply>
		<m:minus/>
		<m:apply>
		  <m:times/>
		  <m:ci>C</m:ci>
		  <m:ci>X</m:ci>
		</m:apply>
		<m:apply>
		  <m:times/>
		  <m:ci>S</m:ci>
		  <m:ci>Y</m:ci>
		</m:apply>
	      </m:apply>
	      <m:apply>
		<m:times/>
		<m:imaginaryi/>
		<m:apply>
		  <m:plus/>
		  <m:apply>
		    <m:times/>
		    <m:ci>C</m:ci>
		    <m:ci>Y</m:ci>
		  </m:apply>
		  <m:apply>
		    <m:times/>
		    <m:ci>S</m:ci>
		    <m:ci>X</m:ci>
		  </m:apply>
		</m:apply>
	      </m:apply>
	    </m:apply>
	  </m:apply>
	</m:math>
	but alernatively
	<m:math display="block">
	  <m:apply>
	    <m:eq/>
	    <m:ci>Z</m:ci>
	    <m:apply>
	      <m:times/>
	      <m:ci>C</m:ci>
	      <m:apply>
		<m:minus/>
		<m:ci>X</m:ci>
		<m:ci>Y</m:ci>
	      </m:apply>
	    </m:apply>
	  </m:apply>
	</m:math>
	<m:math display="block">
	  <m:apply>
	    <m:eq/>
	    <m:ci>D</m:ci>
	    <m:apply>
	      <m:plus/>
	      <m:ci>C</m:ci>
	      <m:ci>S</m:ci>
	    </m:apply>
	  </m:apply>
	</m:math>
	<m:math display="block">
	  <m:apply>
	    <m:eq/>
	    <m:ci>E</m:ci>
	    <m:apply>
	      <m:minus/>
	      <m:ci>C</m:ci>
	      <m:ci>S</m:ci>
	    </m:apply>
	  </m:apply>
	</m:math>

	<m:math display="block">
	  <m:apply>
	    <m:eq/>
	    <m:apply>
	      <m:minus/>
	      <m:apply>
		<m:times/>
		<m:ci>C</m:ci>
		<m:ci>X</m:ci>
	      </m:apply>
	      <m:apply>
		<m:times/>
		<m:ci>S</m:ci>
		<m:ci>Y</m:ci>
	      </m:apply>
	    </m:apply>
	    <m:apply>
	      <m:plus/>
	      <m:apply>
		<m:times/>
		<m:ci>E</m:ci>
		<m:ci>Y</m:ci>
	      </m:apply>
	      <m:ci>Z</m:ci>
	    </m:apply>
	  </m:apply>
	</m:math>
	<m:math display="block">
	  <m:apply>
	    <m:eq/>
	    <m:apply>
	      <m:plus/>
	      <m:apply>
		<m:times/>
		<m:ci>C</m:ci>
		<m:ci>Y</m:ci>
	      </m:apply>
	      <m:apply>
		<m:times/>
		<m:ci>S</m:ci>
		<m:ci>X</m:ci>
	      </m:apply>
	    </m:apply>
	    <m:apply>
	      <m:minus/>
	      <m:apply>
		<m:times/>
		<m:ci>D</m:ci>
		<m:ci>X</m:ci>
	      </m:apply>
	      <m:ci>Z</m:ci>
	    </m:apply>
	  </m:apply>
	</m:math>
              In an FFT,
	<m:math><m:ci>D</m:ci></m:math> and
	<m:math><m:ci>E</m:ci></m:math> 
        come entirely from the twiddle factors,
        so they can be precomputed and stored in a look-up table.
        This reduces the cost of the complex twiddle-factor multiply
        to 3 real multiplies and 3 real adds, or one less and one more,
        respectively, than the conventional 4/2 computation.</para>
    </section>

    <section xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
    <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Special butterflies</name>
    <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="element-992">Certain twiddle factors,
        namely
	<m:math>
        <m:apply>
        <m:eq/>
	  <m:ci>
	    <m:msubsup>
	      <m:mi>W</m:mi>
	      <m:mi>N</m:mi>
	      <m:mn>0</m:mn>
	    </m:msubsup>
	  </m:ci>
          <m:cn>1</m:cn>
        </m:apply>
	</m:math>,
	<m:math>
	  <m:ci>
	    <m:msubsup>
	      <m:mi>W</m:mi>
	      <m:mi>N</m:mi>
	      <m:mfrac>
		<m:mi>N</m:mi>
		<m:mn>2</m:mn>
	      </m:mfrac>
	    </m:msubsup>
	  </m:ci>
	</m:math>,
	<m:math>
	  <m:ci>
	    <m:msubsup>
	      <m:mi>W</m:mi>
	      <m:mi>N</m:mi>
	      <m:mfrac>
		<m:mi>N</m:mi>
		<m:mn>4</m:mn>
	      </m:mfrac>
	    </m:msubsup>
	  </m:ci>
	</m:math>,
	<m:math>
	  <m:ci>
	    <m:msubsup>
	      <m:mi>W</m:mi>
	      <m:mi>N</m:mi>
	      <m:mfrac>
		<m:mi>N</m:mi>
		<m:mn>8</m:mn>
	      </m:mfrac>
	    </m:msubsup>
	  </m:ci>
	</m:math>,
	<m:math>
	  <m:ci>
	    <m:msubsup>
	      <m:mi>W</m:mi>
	      <m:mi>N</m:mi>
	      <m:mfrac>
		<m:mrow>
		  <m:mn>3</m:mn>
		  <m:mi>N</m:mi>
		</m:mrow>
		<m:mn>8</m:mn>
	      </m:mfrac>
	    </m:msubsup>
	  </m:ci>
	</m:math>, etc., can be implemented
with no additional operations, or with fewer real operations than
a general complex multiply.
Programs that specially implement such butterflies in the most
efficient manner throughout the algorithm can reduce the
computational cost by up to several <m:math><m:ci>N</m:ci></m:math>
multiplies and additions in a length-<m:math><m:ci>N</m:ci></m:math>
FFT.
      </para>
      </section>
      <section xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
      <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Practical Perspective</name>
      <para xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="element-269">When optimizing FFTs for speed, it can be important to maintain
perspective on the benefits that can be expected from
any given optimization.
The following list categorizes the various techniques by potential
benefit;
these will be somewhat situation- and machine-dependent, but clearly
one should begin with the most significant and put the most effort
where the pay-off is likely to be largest.</para><list xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="list2" type="bulleted"><name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Methods to speed up computation of DFTs</name>
	<item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/"><name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Tremendous Savings</name>
        <list xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="listX" type="enumerated">
        <item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
	  FFT 
	  (<m:math>
	    <m:apply>
	      <m:divide/>
	      <m:ci>N</m:ci>
	      <m:apply>
		<m:log/>
		<m:logbase>
		  <m:cn>2</m:cn>
		</m:logbase>
		<m:ci>N</m:ci>
	      </m:apply>
	    </m:apply>
	  </m:math> savings)
          </item>
        </list>
	</item>
	<item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/"><name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Substantial Savings</name>
	  (<m:math>
	    <m:apply>
	      <m:geq/>
	      <m:ci>2</m:ci>
	    </m:apply>
	  </m:math>)
	  <list xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="list3" type="enumerated">
	    <item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Table lookup of cosine/sine</item>
	    <item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Compiler tricks/good programming</item>
	    <item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Assembly-language programming</item>
	    <item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Special-purpose hardware</item>
	    <item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Real-data FFT for real data (factor of 2)</item>
	    <item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Special cases</item>
	  </list>
	</item>
	<item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
	  <name xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Minor Savings</name>
	  <list xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" id="list4" type="enumerated"><item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/"><cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12027">radix-4</cnxn>, <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12031">split-radix</cnxn> (-10% - +30%)</item>
            <item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">special butterflies</item>
	    <item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">
	      3-real-multiplication complex multiply
	    </item>
	    <item xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/">Fast bit-reversal (up to 6%)</item>
	  </list>
	</item>
      </list>
    </section>
    <note xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" type="fact">On general-purpose machines, computation is only part
      of the total run time.  Address generation, indexing, data
      shuffling, and memory access take up much or most of the cycles.
    </note>
    <note xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" type="fact">A well-written <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12016">radix-2</cnxn> program will run much faster than a poorly written
      <cnxn xmlns:md="http://cnx.rice.edu/mdml/0.4" xmlns:m="http://www.w3.org/1998/Math/MathML" xmlns:bib="http://bibtexml.sf.net/" document="m12031">split-radix</cnxn> program!
    </note>
    
  </content>
  <bib:file>
    <bib:entry id="Evans">
      <bib:article>
	<bib:author>D.M.W. Evans</bib:author>
	<bib:title>An improved digit-reversal permutation algorithm
	for the fast Fourier and Hartley transforms</bib:title>
	<bib:journal>IEEE Transactions on Signal Processing</bib:journal>
	<bib:year>1987</bib:year>
	<bib:volume>35</bib:volume>
	<bib:number>8</bib:number>
	<bib:pages>1120-1125</bib:pages>
	<bib:month>August</bib:month>
      </bib:article>
    </bib:entry>
    <bib:entry id="Frigo">
      <bib:article>
	<bib:author>M. Frigo and S.G. Johnson</bib:author>
	<bib:title>The design and implementation of FFTW3</bib:title>
	<bib:journal>Proceedings of the IEEE</bib:journal>
	<bib:year>2005</bib:year>
	<bib:volume>93</bib:volume>
	<bib:number>2</bib:number>
	<bib:pages>216-231</bib:pages>
	<bib:month>February</bib:month>
      </bib:article>
    </bib:entry>
    <bib:entry id="Meyer">
      <bib:article>
	<bib:author>R. Meyer, H.W. Schuessler, and K. Schwarz</bib:author>
	<bib:title>FFT Implmentation on DSP chips - Theory and Practice</bib:title>
	<bib:journal>IEEE International Conference on Acoustics, Speech, and Signal Processing</bib:journal>
	<bib:year>1990</bib:year>
      </bib:article>
    </bib:entry>
    <bib:entry id="Sorensen2">
      <bib:article>
	<bib:author>H.V. Sorensen, D.L Jones, M.T. Heideman, and
	C.S. Burrus</bib:author>
	<bib:title>Real-valued fast Fourier transform algorithms</bib:title>
	<bib:journal>IEEE Transactions on Signal Processing</bib:journal>
	<bib:year>1987</bib:year>
	<bib:volume>35</bib:volume>
	<bib:number>6</bib:number>
	<bib:pages>849-863</bib:pages>
	<bib:month>June</bib:month>
      </bib:article>
    </bib:entry>
    <bib:entry id="Sorensen3">
      <bib:article>
	<bib:author>H.V. Sorensen and C.S. Burrus</bib:author>
	<bib:title>Efficient computation of the DFT with only a subset of input or output points</bib:title>
	<bib:journal>IEEE Transactions on Signal Processing</bib:journal>
	<bib:year>1993</bib:year>
	<bib:volume>41</bib:volume>
	<bib:number>3</bib:number>
	<bib:pages>1184-1200</bib:pages>
	<bib:month>March</bib:month>
      </bib:article>
    </bib:entry>
  </bib:file>
</document>
