X-Git-Url: https://scm.cri.ensmp.fr/git/Faustine.git/blobdiff_plain/c7f552fd8888da2f0d8cfb228fe0f28d3df3a12c..b4b6f2ea75b9f0f3ca918f5b84016610bf7a4d4f:/interpretor/preprocessor/faust-0.9.47mr3/documentation/faust-quick-reference-src/chapters/codegeneration.tex diff --git a/interpretor/preprocessor/faust-0.9.47mr3/documentation/faust-quick-reference-src/chapters/codegeneration.tex b/interpretor/preprocessor/faust-0.9.47mr3/documentation/faust-quick-reference-src/chapters/codegeneration.tex new file mode 100644 index 0000000..f58426e --- /dev/null +++ b/interpretor/preprocessor/faust-0.9.47mr3/documentation/faust-quick-reference-src/chapters/codegeneration.tex @@ -0,0 +1,413 @@ +\chapter{Controlling the code generation} +Several options of the \faust compiler make it possible to control the generated C++ code. By default, the computations are done sample by sample in a single loop. But the compiler can also generate \textit{vector} and \textit{parallel} code. + + +\section{Vector Code generation} +Modern C++ compilers are able to do autovectorization, that is, to use SIMD instructions to speed up the code. These instructions can typically operate in parallel on short vectors of 4 single-precision floating-point numbers, thus leading to a theoretical speedup of $\times4$. +Autovectorization of C/C++ programs is a difficult task. Current compilers are very sensitive to the way the code is arranged. In particular, overly complex loops can prevent autovectorization. The goal of the vector code generation is to rearrange the C++ code in a way that facilitates the autovectorization job of the C++ compiler. Instead of generating a single sample computation loop, it splits the computation into several simpler loops that communicate by vectors. + +The vector code generation is activated by passing the \lstinline!--vectorize! (or \lstinline!-vec!) option to the \faust compiler. Two additional options are available: \lstinline!--vec-size <n>! 
controls the size of the vector (by default 32 samples) and \lstinline!--loop-variant 0/1! gives some additional control over the loops. + +To illustrate the difference between scalar code and vector code, let's take the computation of the RMS (Root Mean Square) value of a signal. Here is the \faust code that computes the Root Mean Square of a sliding window of 1000 samples: +\label{rms} +\begin{lstlisting} +// Root Mean Square of n consecutive samples +RMS(n) = square : mean(n) : sqrt ; + +// Square of a signal +square(x) = x * x ; + +// Mean of n consecutive samples of a signal +// (uses fixed point to avoid the accumulation of +// rounding errors) +mean(n) = float2fix : integrate(n) : + fix2float : /(n); + +// Sliding sum of n consecutive samples +integrate(n,x) = x - x@n : +~_ ; + +// Conversion between float and fixed point +float2fix(x) = int(x*(1<<20)); +fix2float(x) = float(x)/(1<<20); + +// Root Mean Square of 1000 consecutive samples +process = RMS(1000) ; +\end{lstlisting} + +The compute() method generated in scalar mode is the following: + +\begin{lstlisting} +virtual void compute (int count, + float** input, + float** output) +{ + float* input0 = input[0]; + float* output0 = output[0]; + for (int i=0; iinput = input; + this->output = output; + StartMeasure(); + for (fIndex = 0; fIndex < fullcount; fIndex += 32) { + fFullCount = min (32, fullcount-fIndex); + TaskQueue::Init(); + // Initialize end task + fGraph.InitTask(1,1); + // Only initialize tasks with inputs + fGraph.InitTask(4,2); + fIsFinished = false; + fThreadPool.SignalAll(fDynamicNumThreads - 1); + computeThread(0); + while (!fThreadPool.IsFinished()) {} + } + StopMeasure(fStaticNumThreads, + fDynamicNumThreads); +} +void computeThread (int cur_thread) { + float* fRec0 = &fRec0_tmp[4]; + float* fRec1 = &fRec1_tmp[4]; + // Init graph state + { + TaskQueue taskqueue; + int tasknum = -1; + int count = fFullCount; + // Init input and output + FAUSTFLOAT* input0 = &input[0][fIndex]; + FAUSTFLOAT* input1 
= &input[1][fIndex]; + FAUSTFLOAT* output0 = &output[0][fIndex]; + int task_list_size = 2; + int task_list[2] = {2,3}; + taskqueue.InitTaskList(task_list_size, task_list, fDynamicNumThreads, cur_thread, tasknum); + while (!fIsFinished) { + switch (tasknum) { + case WORK_STEALING_INDEX: { + tasknum = TaskQueue::GetNextTask(cur_thread); + break; + } + case LAST_TASK_INDEX: { + fIsFinished = true; + break; + } + // SECTION : 1 + case 2: { + // LOOP 0x101111680 + // pre processing + for (int i=0; i<4; i++) fRec0_tmp[i]=fRec0_perm[i]; + // exec code + for (int i=0; i