1 /*
2  * Copyright (C) 2016 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
18 #define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
19 
20 #include "base/scoped_arena_allocator.h"
21 #include "base/scoped_arena_containers.h"
22 #include "induction_var_range.h"
23 #include "loop_analysis.h"
24 #include "nodes.h"
25 #include "optimization.h"
26 #include "superblock_cloner.h"
27 
28 namespace art {
29 
30 class CompilerOptions;
31 class ArchNoOptsLoopHelper;
32 
33 /**
34  * Loop optimizations. Builds a loop hierarchy and applies optimizations to
35  * the detected nested loops, such as removal of dead induction and empty loops
36  * and inner loop vectorization.
37  */
38 class HLoopOptimization : public HOptimization {
39  public:
40   HLoopOptimization(HGraph* graph,
41                     const CompilerOptions* compiler_options,
42                     HInductionVarAnalysis* induction_analysis,
43                     OptimizingCompilerStats* stats,
44                     const char* name = kLoopOptimizationPassName);
45 
46   bool Run() override;
47 
48   static constexpr const char* kLoopOptimizationPassName = "loop_optimization";
49 
50  private:
51   /**
52    * A single loop inside the loop hierarchy representation.
53    */
54   struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> {
LoopNodeLoopNode55     explicit LoopNode(HLoopInformation* lp_info)
56         : loop_info(lp_info),
57           outer(nullptr),
58           inner(nullptr),
59           previous(nullptr),
60           next(nullptr) {}
61     HLoopInformation* loop_info;
62     LoopNode* outer;
63     LoopNode* inner;
64     LoopNode* previous;
65     LoopNode* next;
66   };
67 
68   /*
69    * Vectorization restrictions (bit mask).
70    */
71   enum VectorRestrictions {
72     kNone            = 0,        // no restrictions
73     kNoMul           = 1 << 0,   // no multiplication
74     kNoDiv           = 1 << 1,   // no division
75     kNoShift         = 1 << 2,   // no shift
76     kNoShr           = 1 << 3,   // no arithmetic shift right
77     kNoHiBits        = 1 << 4,   // "wider" operations cannot bring in higher order bits
78     kNoSignedHAdd    = 1 << 5,   // no signed halving add
79     kNoUnroundedHAdd = 1 << 6,   // no unrounded halving add
80     kNoAbs           = 1 << 7,   // no absolute value
81     kNoStringCharAt  = 1 << 8,   // no StringCharAt
82     kNoReduction     = 1 << 9,   // no reduction
83     kNoSAD           = 1 << 10,  // no sum of absolute differences (SAD)
84     kNoWideSAD       = 1 << 11,  // no sum of absolute differences (SAD) with operand widening
85     kNoDotProd       = 1 << 12,  // no dot product
86   };
87 
88   /*
89    * Vectorization mode during synthesis
90    * (sequential peeling/cleanup loop or vector loop).
91    */
92   enum VectorMode {
93     kSequential,
94     kVector
95   };
96 
97   /*
98    * Representation of a unit-stride array reference.
99    */
100   struct ArrayReference {
101     ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false)
baseArrayReference102         : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { }
103     bool operator<(const ArrayReference& other) const {
104       return
105           (base < other.base) ||
106           (base == other.base &&
107            (offset < other.offset || (offset == other.offset &&
108                                       (type < other.type ||
109                                        (type == other.type &&
110                                         (lhs < other.lhs ||
111                                          (lhs == other.lhs &&
112                                           is_string_char_at < other.is_string_char_at)))))));
113     }
114     HInstruction* base;      // base address
115     HInstruction* offset;    // offset + i
116     DataType::Type type;     // component type
117     bool lhs;                // def/use
118     bool is_string_char_at;  // compressed string read
119   };
120 
121   //
122   // Loop setup and traversal.
123   //
124 
125   bool LocalRun();
126   void AddLoop(HLoopInformation* loop_info);
127   void RemoveLoop(LoopNode* node);
128 
129   // Traverses all loops inner to outer to perform simplifications and optimizations.
130   // Returns true if loops nested inside current loop (node) have changed.
131   bool TraverseLoopsInnerToOuter(LoopNode* node);
132 
133   //
134   // Optimization.
135   //
136 
137   void SimplifyInduction(LoopNode* node);
138   void SimplifyBlocks(LoopNode* node);
139 
140   // Performs optimizations specific to inner loop with finite header logic (empty loop removal,
141   // unrolling, vectorization). Returns true if anything changed.
142   bool TryOptimizeInnerLoopFinite(LoopNode* node);
143 
144   // Performs optimizations specific to inner loop. Returns true if anything changed.
145   bool OptimizeInnerLoop(LoopNode* node);
146 
147   // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling
148   // opportunities. Returns whether transformation happened. 'generate_code' determines whether the
149   // optimization should be actually applied.
150   bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info,
151                                              bool generate_code = true);
152 
153   // Tries to apply loop peeling for loop invariant exits elimination. Returns whether
154   // transformation happened. 'generate_code' determines whether the optimization should be
155   // actually applied.
156   bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info,
157                                                   bool generate_code = true);
158 
159   // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate
160   // the loop check overhead and to have more opportunities for inter-iteration optimizations.
161   // Returns whether transformation happened. 'generate_code' determines whether the optimization
162   // should be actually applied.
163   bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true);
164 
165   // Tries to apply scalar loop peeling and unrolling.
166   bool TryPeelingAndUnrolling(LoopNode* node);
167 
168   //
169   // Vectorization analysis and synthesis.
170   //
171 
172   bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
173   void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
174   void GenerateNewLoop(LoopNode* node,
175                        HBasicBlock* block,
176                        HBasicBlock* new_preheader,
177                        HInstruction* lo,
178                        HInstruction* hi,
179                        HInstruction* step,
180                        uint32_t unroll);
181   bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
182   bool VectorizeUse(LoopNode* node,
183                     HInstruction* instruction,
184                     bool generate_code,
185                     DataType::Type type,
186                     uint64_t restrictions);
187   uint32_t GetVectorSizeInBytes();
188   bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions);
189   bool TrySetVectorLength(uint32_t length);
190   void GenerateVecInv(HInstruction* org, DataType::Type type);
191   void GenerateVecSub(HInstruction* org, HInstruction* offset);
192   void GenerateVecMem(HInstruction* org,
193                       HInstruction* opa,
194                       HInstruction* opb,
195                       HInstruction* offset,
196                       DataType::Type type);
197   void GenerateVecReductionPhi(HPhi* phi);
198   void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
199   HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
200   void GenerateVecOp(HInstruction* org,
201                      HInstruction* opa,
202                      HInstruction* opb,
203                      DataType::Type type);
204 
205   // Vectorization idioms.
206   bool VectorizeSaturationIdiom(LoopNode* node,
207                                 HInstruction* instruction,
208                                 bool generate_code,
209                                 DataType::Type type,
210                                 uint64_t restrictions);
211   bool VectorizeHalvingAddIdiom(LoopNode* node,
212                                 HInstruction* instruction,
213                                 bool generate_code,
214                                 DataType::Type type,
215                                 uint64_t restrictions);
216   bool VectorizeSADIdiom(LoopNode* node,
217                          HInstruction* instruction,
218                          bool generate_code,
219                          DataType::Type type,
220                          uint64_t restrictions);
221   bool VectorizeDotProdIdiom(LoopNode* node,
222                              HInstruction* instruction,
223                              bool generate_code,
224                              DataType::Type type,
225                              uint64_t restrictions);
226 
227   // Vectorization heuristics.
228   Alignment ComputeAlignment(HInstruction* offset,
229                              DataType::Type type,
230                              bool is_string_char_at,
231                              uint32_t peeling = 0);
232   void SetAlignmentStrategy(uint32_t peeling_votes[],
233                             const ArrayReference* peeling_candidate);
234   uint32_t MaxNumberPeeled();
235   bool IsVectorizationProfitable(int64_t trip_count);
236 
237   //
238   // Helpers.
239   //
240 
241   bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
242   bool TrySetPhiReduction(HPhi* phi);
243 
244   // Detects loop header with a single induction (returned in main_phi), possibly
245   // other phis for reductions, but no other side effects. Returns true on success.
246   bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi);
247 
248   bool IsEmptyBody(HBasicBlock* block);
249   bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
250                            HInstruction* instruction,
251                            bool collect_loop_uses,
252                            /*out*/ uint32_t* use_count);
253   bool IsUsedOutsideLoop(HLoopInformation* loop_info,
254                          HInstruction* instruction);
255   bool TryReplaceWithLastValue(HLoopInformation* loop_info,
256                                HInstruction* instruction,
257                                HBasicBlock* block);
258   bool TryAssignLastValue(HLoopInformation* loop_info,
259                           HInstruction* instruction,
260                           HBasicBlock* block,
261                           bool collect_loop_uses);
262   void RemoveDeadInstructions(const HInstructionList& list);
263   bool CanRemoveCycle();  // Whether the current 'iset_' is removable.
264 
265   // Compiler options (to query ISA features).
266   const CompilerOptions* compiler_options_;
267 
268   // Range information based on prior induction variable analysis.
269   InductionVarRange induction_range_;
270 
271   // Phase-local heap memory allocator for the loop optimizer. Storage obtained
272   // through this allocator is immediately released when the loop optimizer is done.
273   ScopedArenaAllocator* loop_allocator_;
274 
275   // Global heap memory allocator. Used to build HIR.
276   ArenaAllocator* global_allocator_;
277 
278   // Entries into the loop hierarchy representation. The hierarchy resides
279   // in phase-local heap memory.
280   LoopNode* top_loop_;
281   LoopNode* last_loop_;
282 
283   // Temporary bookkeeping of a set of instructions.
284   // Contents reside in phase-local heap memory.
285   ScopedArenaSet<HInstruction*>* iset_;
286 
287   // Temporary bookkeeping of reduction instructions. Mapping is two-fold:
288   // (1) reductions in the loop-body are mapped back to their phi definition,
289   // (2) phi definitions are mapped to their initial value (updated during
290   //     code generation to feed the proper values into the new chain).
291   // Contents reside in phase-local heap memory.
292   ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_;
293 
294   // Flag that tracks if any simplifications have occurred.
295   bool simplified_;
296 
297   // Number of "lanes" for selected packed type.
298   uint32_t vector_length_;
299 
300   // Set of array references in the vector loop.
301   // Contents reside in phase-local heap memory.
302   ScopedArenaSet<ArrayReference>* vector_refs_;
303 
304   // Static or dynamic loop peeling for alignment.
305   uint32_t vector_static_peeling_factor_;
306   const ArrayReference* vector_dynamic_peeling_candidate_;
307 
308   // Dynamic data dependence test of the form a != b.
309   HInstruction* vector_runtime_test_a_;
310   HInstruction* vector_runtime_test_b_;
311 
312   // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
313   // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data
314   // structure maps original instructions into the new instructions.
315   // Contents reside in phase-local heap memory.
316   ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
317 
318   // Permanent mapping used during vectorization synthesis.
319   // Contents reside in phase-local heap memory.
320   ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_;
321 
322   // Temporary vectorization bookkeeping.
323   VectorMode vector_mode_;  // synthesis mode
324   HBasicBlock* vector_preheader_;  // preheader of the new loop
325   HBasicBlock* vector_header_;  // header of the new loop
326   HBasicBlock* vector_body_;  // body of the new loop
327   HInstruction* vector_index_;  // normalized index of the new loop
328 
329   // Helper for target-specific behaviour for loop optimizations.
330   ArchNoOptsLoopHelper* arch_loop_helper_;
331 
332   friend class LoopOptimizationTest;
333 
334   DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
335 };
336 
337 }  // namespace art
338 
339 #endif  // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
340