FORM 4.3
|
#include <vector>
#include <stack>
#include <algorithm>
#include <set>
#include <map>
#include <climits>
#include <cmath>
#include <string>
#include <iostream>
#include <tr1/unordered_map>
#include <tr1/unordered_set>
#include "form3.h"
#include "mytime.h"
Go to the source code of this file.
Data Structures | |
class | tree_node |
struct | CSEHash |
struct | CSEEq |
struct | node |
struct | NodeHash |
struct | NodeEq |
class | optimization |
Typedefs | |
typedef struct node | NODE |
Functions | |
void | print_instr (const vector< WORD > &instr, WORD num) |
template<class RandomAccessIterator> | |
void | my_random_shuffle (PHEAD RandomAccessIterator fr, RandomAccessIterator to) |
LONG | get_expression (int exprnr) |
vector< vector< WORD > > | get_brackets () |
int | count_operators (const WORD *expr, bool print=false) |
int | count_operators (const vector< WORD > &instr, bool print=false) |
vector< WORD > | occurrence_order (const WORD *expr, bool rev) |
WORD | getpower (const WORD *term, int var, int pos) |
void | fixarg (UWORD *t, WORD &n) |
void | GcdLong_fix_args (PHEAD UWORD *a, WORD na, UWORD *b, WORD nb, UWORD *c, WORD *nc) |
void | DivLong_fix_args (UWORD *a, WORD na, UWORD *b, WORD nb, UWORD *c, WORD *nc, UWORD *d, WORD *nd) |
void | build_Horner_tree (const WORD **terms, int numterms, int var, int maxvar, int pos, vector< WORD > *res) |
bool | term_compare (const WORD *a, const WORD *b) |
vector< WORD > | Horner_tree (const WORD *expr, const vector< WORD > &order) |
void | print_tree (const vector< WORD > &tree) |
template<typename T> | |
size_t | hash_range (T *array, int size) |
vector< WORD > | generate_instructions (const vector< WORD > &tree, bool do_CSE) |
int | count_operators_cse (const vector< WORD > &tree) |
NODE * | buildTree (vector< WORD > &tree) |
int | count_operators_cse_topdown (vector< WORD > &tree) |
vector< WORD > | simulated_annealing () |
void | find_Horner_MCTS_expand_tree () |
void | find_Horner_MCTS () |
vector< WORD > | merge_operators (const vector< WORD > &all_instr, bool move_coeff) |
vector< optimization > | find_optimizations (const vector< WORD > &instr) |
bool | do_optimization (const optimization optim, vector< WORD > &instr, int newid) |
int | partial_factorize (vector< WORD > &instr, int n, int improve) |
vector< WORD > | optimize_greedy (vector< WORD > instr, LONG time_limit) |
vector< WORD > | recycle_variables (const vector< WORD > &all_instr) |
void | optimize_expression_given_Horner () |
VOID | generate_output (const vector< WORD > &instr, int exprnr, int extraoffset, const vector< vector< WORD > > &brackets) |
WORD | generate_expression (WORD exprnr) |
VOID | optimize_print_code (int print_expr) |
int | Optimize (WORD exprnr, int do_print) |
int | ClearOptimize () |
Variables | |
const WORD | OPER_ADD = -1 |
const WORD | OPER_MUL = -2 |
const WORD | OPER_COMMA = -3 |
WORD * | optimize_expr |
vector< vector< WORD > > | optimize_best_Horner_schemes |
int | optimize_num_vars |
int | optimize_best_num_oper |
vector< WORD > | optimize_best_instr |
vector< WORD > | optimize_best_vars |
bool | mcts_factorized |
bool | mcts_separated |
vector< WORD > | mcts_vars |
tree_node | mcts_root |
int | mcts_expr_score |
set< pair< int, vector< WORD > > > | mcts_best_schemes |
experimental routines for the optimization of FORTRAN or C output.
Definition in file optimize.cc.
void print_instr | ( | const vector< WORD > & | instr, |
WORD | num ) |
Definition at line 158 of file optimize.cc.
void my_random_shuffle | ( | PHEAD RandomAccessIterator | fr, |
RandomAccessIterator | to ) |
Random shuffle
Randomly permutes elements in the range [fr,to). Functionality is the same as C++'s "random_shuffle", but it uses Form's "wranf".
Definition at line 180 of file optimize.cc.
Referenced by find_Horner_MCTS().
LONG get_expression | ( | int | exprnr | ) |
Get expression
Reads an expression from the input file into a buffer (called optimize_expr). This buffer is used during the optimization process. Non-symbols are removed by ConvertToPoly and are put in temporary symbols.
The return value is the length of the expression in WORDs, or a negative number if it failed.
Definition at line 204 of file optimize.cc.
References EndSort(), LowerSortLevel(), NewSort(), and StoreTerm().
Referenced by Optimize().
vector< vector< WORD > > get_brackets | ( | ) |
Get brackets
Checks whether the input expression (stored in optimize_expr) contains brackets. If so, this method replaces terms outside brackets by powers of SEPERATESYMBOL (equal brackets have equal powers) and the brackets are returned. If not, the result is empty.
Brackets are used for simultaneous optimization. The symbol SEPARATESYMBOL is always the first one used in a Horner scheme.
Definition at line 281 of file optimize.cc.
Referenced by Optimize().
int count_operators | ( | const WORD * | expr, |
bool | print = false ) |
Count operators
Counts the number of operators in a Form-style expression.
Definition at line 401 of file optimize.cc.
Referenced by find_Horner_MCTS(), generate_instructions(), Optimize(), optimize_expression_given_Horner(), and optimize_greedy().
int count_operators | ( | const vector< WORD > & | instr, |
bool | print = false ) |
Count operators
Counts the number of operators in a vector of instructions
Definition at line 455 of file optimize.cc.
vector< WORD > occurrence_order | ( | const WORD * | expr, |
bool | rev ) |
Occurrence order
Extracts all variables from an expression and sorts them with most occurring first (or last, with rev=true)
Definition at line 498 of file optimize.cc.
Referenced by Optimize().
WORD getpower | ( | const WORD * | term, |
int | var, | ||
int | pos ) |
Horner tree building
Given a Form-style expression (in a buffer in memory), this builds an expression tree. The tree is determined by a multivariate Horner scheme, i.e., something of the form:
1+y+x*(2+y*(1+y)+x*(3-y*(...)))
The order of the variables is given to the method "Horner_tree", which renumbers ad reorders the terms of the expression. Next, the recursive method "build_Horner_tree" does the actual tree construction.
The tree is represented in postfix notation. Tokens are of the following forms:
Sets AN.poly_num_vars and allocates AN.poly_vars. The latter should be freed later. Get power of variable (helper function for build_Horner_tree)
Returns the power of the variable "var", which is at position "pos" in this term, if it is present.
Definition at line 579 of file optimize.cc.
Referenced by build_Horner_tree().
void fixarg | ( | UWORD * | t, |
WORD & | n ) |
Call GcdLong/DivLong with leading zeroes
These method remove leading zeroes, so that GcdLong and DivLong can safely be called.
Definition at line 593 of file optimize.cc.
void GcdLong_fix_args | ( | PHEAD UWORD * | a, |
WORD | na, | ||
UWORD * | b, | ||
WORD | nb, | ||
UWORD * | c, | ||
WORD * | nc ) |
Definition at line 599 of file optimize.cc.
void DivLong_fix_args | ( | UWORD * | a, |
WORD | na, | ||
UWORD * | b, | ||
WORD | nb, | ||
UWORD * | c, | ||
WORD * | nc, | ||
UWORD * | d, | ||
WORD * | nd ) |
Definition at line 605 of file optimize.cc.
void build_Horner_tree | ( | const WORD ** | terms, |
int | numterms, | ||
int | var, | ||
int | maxvar, | ||
int | pos, | ||
vector< WORD > * | res ) |
Build the Horner tree
Constructs the Horner tree. The method processes one variable and continues recursively until the Horner scheme is completed.
"terms" is a pointer to the starts of the terms. "numterms" is the number of terms to be processed. "var" is the next variable to be processed (index between 0 and #maxvar) and "maxvar" is the last variable to be processed, so that partial Horner trees can also be constructed. "pos" is the position that the power of "var" should be in (one level further in the recursion, "pos" has increased by 0 or 1 depending on whether the previous power was 0 or not). The result is written at the pointer "res".
This method also factors out gcds of the coefficients. The result should end with "gcd OPER_MUL" at all times, so that one level higher gcds can be extracted again.
Definition at line 631 of file optimize.cc.
References build_Horner_tree(), and getpower().
Referenced by build_Horner_tree(), and Horner_tree().
bool term_compare | ( | const WORD * | a, |
const WORD * | b ) |
Term compare (helper function for Horner_tree)
Compares two terms of the form "L SYM 4 x n coeff" or "L coeff". Lower powers of lower-indexed symbols come first. This is used to sort the terms in correct order.
Definition at line 831 of file optimize.cc.
Referenced by Horner_tree().
vector< WORD > Horner_tree | ( | const WORD * | expr, |
const vector< WORD > & | order ) |
Prepare Horner tree building
This method renumbers the variables to 0...#vars-1 according to the specified order. Next, it stored pointer to individual terms and sorts the terms with higher power first. Then the sorted lists of power is used for the construction of the Horner tree.
Definition at line 852 of file optimize.cc.
References build_Horner_tree(), and term_compare().
Referenced by optimize_expression_given_Horner().
void print_tree | ( | const vector< WORD > & | tree | ) |
Definition at line 955 of file optimize.cc.
size_t hash_range | ( | T * | array, |
int | size ) |
Definition at line 1024 of file optimize.cc.
vector< WORD > generate_instructions | ( | const vector< WORD > & | tree, |
bool | do_CSE ) |
Generate instructions
Converts the expression tree to a list of instructions that directly translate to code. Instructions are of the form:
expr.nr operator length [operands]+ trailing.zero
The operands are of the form:
length [(EXTRA)SYMBOL length variable power] coeff
This method only generates binary operators. Merging is done later. The method also checks for common subexpressions and eliminates them and the flag "do_CSE" is set.
The map "ID" keeps track of which subexpressions already exist. The key is formatted as one of the following:
SYMBOL x n SNUMBER coeff OPERATOR LHS RHS
with LHS/RHS formatted as one of the following:
SNUMBER idx 0 (EXTRA)SYMBOL x n
ID[symbol] or ID[operator] equals a subexpression number. ID[coeff] equals the position of the number in the input.
The stack s is used the process the postfix expression tree. Three-word tokens of the form:
SNUMBER idx.of.coeff 0 SYMBOL x n EXTRASYMBOL x 1
are pushed onto it. Operators pop two operands and push the resulting expression.
(Extra)symbols are 1-indexed, because -X is also needed to represent -1 times this term.
There is currently a bug. The notation cannot tell if there is a single bracket and then ignores the bracket.
TODO: check if this method performs properly if do_CSE=false
Definition at line 1086 of file optimize.cc.
References count_operators().
Referenced by optimize_expression_given_Horner().
int count_operators_cse | ( | const vector< WORD > & | tree | ) |
Count number of operators in a binary tree, while removing CSEs on the fly. The instruction set is not created, which makes this method slightly faster.
A hash is created on the fly and is passed through the stack. TODO: find better hash functions
Definition at line 1295 of file optimize.cc.
NODE * buildTree | ( | vector< WORD > & | tree | ) |
Definition at line 1517 of file optimize.cc.
int count_operators_cse_topdown | ( | vector< WORD > & | tree | ) |
Definition at line 1579 of file optimize.cc.
vector< WORD > simulated_annealing | ( | ) |
Definition at line 1634 of file optimize.cc.
void find_Horner_MCTS_expand_tree | ( | ) |
Definition at line 2007 of file optimize.cc.
void find_Horner_MCTS | ( | ) |
Find best Horner schemes using MCTS
The method governs the MCTS for the best Horner schemes. It does some pre-processing, calls "find_Horner_MCTS_expand_tree" a number of times and does some post-processing.
Definition at line 2208 of file optimize.cc.
References count_operators(), my_random_shuffle(), and TimeWallClock().
Referenced by Optimize().
vector< WORD > merge_operators | ( | const vector< WORD > & | all_instr, |
bool | move_coeff ) |
Merge operators
The input instructions form a binary DAG. This method merges expressions like
Z1 = a+b; Z2 = Z1+c;
into
Z2 = a+b+c;
An instruction is merged iff it only has one parent and the operator equals its parent's operator.
This still leaves some freedom: where should the coefficients end up in cases as:
Z1 = Z2 + x <=> Z1 = 2*Z2 + x Z2 = 2*x*y Z2 = x*y
Both are relevant, e.g. for CSE of the form "2*x" and "2*Z2". The flag "move_coeff" moves coefficients from LHS-like expressions to RHS-like expressions.
Furthermore, this method removes empty equation (Z1=0), that are introduced by some "optimize_greedy" substitutions.
Expressions are mostly traversed via a stack, so that parents are evaluated before their children.
With "move_coeff" set coefficients are moved, but this leads to some tricky cases, e.g.
Z1 = Z2 + x Z2 = 2*y
Here Z2 reduces to the trivial equation Z2=y, which should be eliminated. Here the array skip[i] comes in.
Furthermore in the case
Z1 = Z2 + x Z2 = 2*Z3 Z3 = x*Z4 Z4 = y*z
after substituting Z1 = 2*Z3 + x, the parent expression for Z4 becomes Z3 instead of Z2. This is where renum_par[i] comes in.
Finally, once a coefficient has been moved, skip_coeff[i] is set and this coefficient is copied into the new expression anymore.
Definition at line 2356 of file optimize.cc.
Referenced by optimize_expression_given_Horner().
vector< optimization > find_optimizations | ( | const vector< WORD > & | instr | ) |
Find optimizations
This method find all optimization of the form described in "class Optimization". It process every equation, looking for possible optimizations and stores them in a fast-access data structure to count the total improvement of an optimization.
Definition at line 2651 of file optimize.cc.
Referenced by optimize_greedy().
bool do_optimization | ( | const optimization | optim, |
vector< WORD > & | instr, | ||
int | newid ) |
Do optimization
This method performs an optimization. It scans through the equations of "optim.eqnidxs" and looks in which this optimization can still be performed (due to other performed optimizations this isn't always the case). If possible, it substitutes the common subexpression by a new extra symbol numbered "newid". Finally, the new extrasymbol is defined accordingly.
Substitutions may lead to trivial equations of the form "Zi=Zj", but these are removed in the end of the method. The method returns whether the substitution has been done once or more (or not).
Definition at line 2884 of file optimize.cc.
Referenced by optimize_greedy().
int partial_factorize | ( | vector< WORD > & | instr, |
int | n, | ||
int | improve ) |
Partial factorization of instructions
This method performs partial factorization of instructions. In particular the following instructions
Z1 = x*a*b Z2 = x*c*d*e Z3 = 2*x + Z1 + Z2 + more
are replaced by
Z1 = a*b Z2 = c*d*e Z3 = Zj + more Zi = 2 + Z1 + Z2 Zj = x*Zi
Here it is necessary that no other equations refer to Z1 and Z2. The generation of trivial instructions (Zi=Zj or Zi=x) is prevented.
Definition at line 3433 of file optimize.cc.
Referenced by optimize_greedy().
vector< WORD > optimize_greedy | ( | vector< WORD > | instr, |
LONG | time_limit ) |
Optimize instructions greedily
This method optimizes an expression greedily. It calls "find_optimizations" to obtain candidates and performs the best one(s) by calling "do_optimization".
How many different optimization are done, before "find_optimization" is called again, is determined by the settings "greedyminnum" and "greedymaxperc".
During the optimization process, sequences of zeroes are introduced in the instructions, since moving all instructions when one gets optimized, is very costly. Therefore, in the end, the instructions are "compressed" again to remove these extra zeroes.
Definition at line 3727 of file optimize.cc.
References count_operators(), do_optimization(), find_optimizations(), partial_factorize(), and TimeWallClock().
Referenced by optimize_expression_given_Horner().
vector< WORD > recycle_variables | ( | const vector< WORD > & | all_instr | ) |
Recycle variables
The current input uses many temporary variables. Many of them become obsolete at some point during the evaluation of the code, so can be recycled. This method renumbers the temporary variables, so that they are recycled. Furthermore, the input is order in depth-first order, so that the instructions can be performed consecutively.
First, for each subDAG, an estimate for the number of variables needed is made. This is done by the following recursive formula:
#vars(x) = max(#vars(ch_i(x)) + i),
with ch_i(x) the i-th child of x, where the childs are ordered w.r.t. #vars(ch_i). This formula is exact if the input forms a tree, and otherwise gives a reasonable estimate.
Then, the instructions are reordered in a depth-first order with childs ordered w.r.t. #vars. Next, the times that variables become obsolete are found. Each LHS of an instruction is renumbered to the lowest-numbered temporary variable that is available at that time.
Definition at line 3867 of file optimize.cc.
Referenced by optimize_expression_given_Horner().
void optimize_expression_given_Horner | ( | ) |
Optimize expression given a Horner scheme
This method picks one Horner scheme from the list of best Horner schemes, applies this scheme to the expression and then, depending on optimize.settings, does a common subexpression elimination (CSE) or performs greedy optimizations.
CSE is fast, while greedy might be slow. CSE followed by greedy is faster than greedy alone, but typically results in slightly worse code (not proven; just observed).
eventually do greedy optimations
Definition at line 4014 of file optimize.cc.
References count_operators(), generate_instructions(), Horner_tree(), merge_operators(), optimize_greedy(), recycle_variables(), and TimeWallClock().
Referenced by Optimize().
VOID generate_output | ( | const vector< WORD > & | instr, |
int | exprnr, | ||
int | extraoffset, | ||
const vector< vector< WORD > > & | brackets ) |
Generate output
This method prepares the instructions for printing. They are stored in Form format, so that they can be printed by "PrintExtraSymbol". The results are stored in the buffer AO.OptimizeResult.
Definition at line 4245 of file optimize.cc.
Referenced by Optimize().
WORD generate_expression | ( | WORD | exprnr | ) |
Generate expression
This method modifies the original optimized expression by an expression with extra symbols. This is used for "#Optimize".
Definition at line 4394 of file optimize.cc.
References EndSort(), Generator(), LowerSortLevel(), NewSort(), and PutOut().
Referenced by Optimize().
VOID optimize_print_code | ( | int | print_expr | ) |
Print optimized code
This method prints the optimized code via "PrintExtraSymbol". Depending on the flag, the original expression is printed (for "Print") or not (for "#Optimize / #write "O").
Definition at line 4474 of file optimize.cc.
Referenced by Optimize().
int Optimize | ( | WORD | exprnr, |
int | do_print ) |
Optimization of expression
This method takes an input expression and generates optimized code to calculate its value. The following methods are called to do so:
(1) get_expression : to read to expression
(2) get_brackets : find brackets for simultaneous optimization
(3) occurrence_order or find_Horner_MCTS : to determine (the) Horner scheme(s) to use; this depends on AO.optimize.horner
(4) optimize_expression_given_Horner : to do the optimizations for each Horner scheme; this method does either CSE or greedy optimizations dependings on AO.optimize.method
(5) generate_output : to format the output in Form notation and store it in a buffer
(6a) optimize_print_code : to print the expression (for "Print") or (6b) generate_expression : to modify the expression (for "#Optimize")
On ParFORM, all the processes must call this function at the same time. Then
(1) Because only the master can access to the expression to be optimized, the master broadcast the expression to all the slaves after reading the expression (PF_get_expression).
(2) get_brackets reads optimize_expr as the input and it works also on the slaves. We leave it although the bracket information is not needed on the slaves (used in (5) on the master).
(3) and (4) find_Horner_MCTS and optimize_expression_given_Horner are parallelized.
(5), (6a) and (6b) are needed only on the master.
Definition at line 4587 of file optimize.cc.
References ClearOptimize(), count_operators(), find_Horner_MCTS(), generate_expression(), generate_output(), get_brackets(), get_expression(), occurrence_order(), optimize_expression_given_Horner(), optimize_print_code(), PF_Broadcast(), PF_LongMultiBroadcast(), PF_Pack(), PF_PrepareLongMultiPack(), PF_PreparePack(), PF_Unpack(), and PutPreVar().
int ClearOptimize | ( | ) |
Optimization of expression
Clears the buffers that were used for optimization output. Clears the expression from the buffers (marks it to be dropped). Note: we need to use the expression by its name, because numbers may change if we drop other expressions between when we do the optimizations and clear the results (in execute.c). Also this is not 100% safe, because we could overwrite the optimized expression. But that can be done only in a Local or Global statement and hence we only have to test there that we might have to call ClearOptimize first. (in file comexpr.c)
Definition at line 4924 of file optimize.cc.
References PutPreVar().
Referenced by Optimize().
const WORD OPER_ADD = -1 |
Definition at line 116 of file optimize.cc.
const WORD OPER_MUL = -2 |
Definition at line 117 of file optimize.cc.
const WORD OPER_COMMA = -3 |
Definition at line 118 of file optimize.cc.
WORD* optimize_expr |
Definition at line 135 of file optimize.cc.
vector<vector<WORD> > optimize_best_Horner_schemes |
Definition at line 136 of file optimize.cc.
int optimize_num_vars |
Definition at line 137 of file optimize.cc.
int optimize_best_num_oper |
Definition at line 138 of file optimize.cc.
vector<WORD> optimize_best_instr |
Definition at line 139 of file optimize.cc.
vector<WORD> optimize_best_vars |
Definition at line 140 of file optimize.cc.
bool mcts_factorized |
Definition at line 143 of file optimize.cc.
bool mcts_separated |
Definition at line 143 of file optimize.cc.
vector<WORD> mcts_vars |
Definition at line 144 of file optimize.cc.
tree_node mcts_root |
Definition at line 145 of file optimize.cc.
int mcts_expr_score |
Definition at line 146 of file optimize.cc.
set<pair<int,vector<WORD> > > mcts_best_schemes |
Definition at line 147 of file optimize.cc.