1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "intrinsics_x86_64.h"
18 
19 #include <limits>
20 
21 #include "arch/x86_64/instruction_set_features_x86_64.h"
22 #include "art_method.h"
23 #include "base/bit_utils.h"
24 #include "code_generator_x86_64.h"
25 #include "entrypoints/quick/quick_entrypoints.h"
26 #include "heap_poisoning.h"
27 #include "intrinsics.h"
28 #include "intrinsics_utils.h"
29 #include "lock_word.h"
30 #include "mirror/array-inl.h"
31 #include "mirror/object_array-inl.h"
32 #include "mirror/reference.h"
33 #include "mirror/string.h"
34 #include "scoped_thread_state_change-inl.h"
35 #include "thread-current-inl.h"
36 #include "utils/x86_64/assembler_x86_64.h"
37 #include "utils/x86_64/constants_x86_64.h"
38 
39 namespace art {
40 
41 namespace x86_64 {
42 
43 IntrinsicLocationsBuilderX86_64::IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen)
44   : allocator_(codegen->GetGraph()->GetAllocator()), codegen_(codegen) {
45 }
46 
// Returns the codegen's assembler, down-cast to the x86-64 implementation.
X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() {
  return down_cast<X86_64Assembler*>(codegen_->GetAssembler());
}

// Returns the arena allocator of the graph being compiled.
ArenaAllocator* IntrinsicCodeGeneratorX86_64::GetAllocator() {
  return codegen_->GetGraph()->GetAllocator();
}
54 
55 bool IntrinsicLocationsBuilderX86_64::TryDispatch(HInvoke* invoke) {
56   Dispatch(invoke);
57   LocationSummary* res = invoke->GetLocations();
58   if (res == nullptr) {
59     return false;
60   }
61   return res->Intrinsified();
62 }
63 
// Moves the invoke's arguments into the locations dictated by the x86-64 Dex
// calling convention (used before falling back to an out-of-line call).
static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
  InvokeDexCallingConventionVisitorX86_64 calling_convention_visitor;
  IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
}
68 
69 using IntrinsicSlowPathX86_64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86_64>;
70 
71 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
72 #define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())->  // NOLINT
73 
// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
class ReadBarrierSystemArrayCopySlowPathX86_64 : public SlowPathCode {
 public:
  explicit ReadBarrierSystemArrayCopySlowPathX86_64(HInstruction* instruction)
      : SlowPathCode(instruction) {
    // Only used under the Baker-style read-barrier configuration.
    DCHECK(kEmitCompilerReadBarrier);
    DCHECK(kUseBakerReadBarrier);
  }

  void EmitNativeCode(CodeGenerator* codegen) override {
    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
    LocationSummary* locations = instruction_->GetLocations();
    DCHECK(locations->CanCall());
    DCHECK(instruction_->IsInvokeStaticOrDirect())
        << "Unexpected instruction in read barrier arraycopy slow path: "
        << instruction_->DebugName();
    DCHECK(instruction_->GetLocations()->Intrinsified());
    DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);

    int32_t element_size = DataType::Size(DataType::Type::kReference);

    // Temps set up by the main intrinsic code: running source/destination
    // addresses, plus the (exclusive) end address of the source range.
    CpuRegister src_curr_addr = locations->GetTemp(0).AsRegister<CpuRegister>();
    CpuRegister dst_curr_addr = locations->GetTemp(1).AsRegister<CpuRegister>();
    CpuRegister src_stop_addr = locations->GetTemp(2).AsRegister<CpuRegister>();

    __ Bind(GetEntryLabel());
    // Copy one reference per iteration: load, mark through the read-barrier
    // entrypoint, then store into the destination.
    NearLabel loop;
    __ Bind(&loop);
    __ movl(CpuRegister(TMP), Address(src_curr_addr, 0));
    __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
    // TODO: Inline the mark bit check before calling the runtime?
    // TMP = ReadBarrier::Mark(TMP);
    // No need to save live registers; it's taken care of by the
    // entrypoint. Also, there is no need to update the stack mask,
    // as this runtime call will not trigger a garbage collection.
    int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP);
    // This runtime call does not require a stack map.
    x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
    __ MaybePoisonHeapReference(CpuRegister(TMP));
    __ movl(Address(dst_curr_addr, 0), CpuRegister(TMP));
    // Advance both cursors by one reference and loop until the source cursor
    // reaches the stop address.
    __ addl(src_curr_addr, Immediate(element_size));
    __ addl(dst_curr_addr, Immediate(element_size));
    __ cmpl(src_curr_addr, src_stop_addr);
    __ j(kNotEqual, &loop);
    __ jmp(GetExitLabel());
  }

  const char* GetDescription() const override { return "ReadBarrierSystemArrayCopySlowPathX86_64"; }

 private:
  DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86_64);
};
126 
127 #undef __
128 
129 #define __ assembler->
130 
// Locations for intrinsics taking one FP value and producing a GP-register
// result (e.g. doubleToRawLongBits).
static void CreateFPToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresRegister());
}

// Locations for intrinsics taking one GP value and producing an FP-register
// result (e.g. longBitsToDouble).
static void CreateIntToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::RequiresFpuRegister());
}
144 
// Moves the raw bits of an XMM register into a GP register; `is64bit` selects
// the 64-bit vs 32-bit form of the move.
static void MoveFPToInt(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ movd(output.AsRegister<CpuRegister>(), input.AsFpuRegister<XmmRegister>(), is64bit);
}

// Moves the raw bits of a GP register into an XMM register; `is64bit` selects
// the 64-bit vs 32-bit form of the move.
static void MoveIntToFP(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ movd(output.AsFpuRegister<XmmRegister>(), input.AsRegister<CpuRegister>(), is64bit);
}
156 
// Double/Float <-> raw-bits intrinsics: each is a single register-to-register
// move between the FP and GP register files (see MoveFPToInt/MoveIntToFP).
void IntrinsicLocationsBuilderX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  CreateFPToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  CreateIntToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit= */ true, GetAssembler());
}
void IntrinsicCodeGeneratorX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit= */ true, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  CreateFPToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  CreateIntToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit= */ false, GetAssembler());
}
void IntrinsicCodeGeneratorX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit= */ false, GetAssembler());
}
184 
// Locations for in-place integer intrinsics: the output reuses the input
// register (SameAsFirstInput).
static void CreateIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
}
191 
// Emits a byte swap of the value in the output register (which, per
// CreateIntToIntLocations, is the same register as the input).
static void GenReverseBytes(LocationSummary* locations,
                            DataType::Type size,
                            X86_64Assembler* assembler) {
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  switch (size) {
    case DataType::Type::kInt16:
      // TODO: Can be done with an xchg of 8b registers. This is straight from Quick.
      // Swap all 32 bits, then arithmetic-shift right by 16 so the reversed
      // 16-bit value ends up sign-extended in the low half.
      __ bswapl(out);
      __ sarl(out, Immediate(16));
      break;
    case DataType::Type::kInt32:
      __ bswapl(out);
      break;
    case DataType::Type::kInt64:
      __ bswapq(out);
      break;
    default:
      LOG(FATAL) << "Unexpected size for reverse-bytes: " << size;
      UNREACHABLE();
  }
}
214 
// Integer/Long/Short.reverseBytes intrinsics, all funneled through
// GenReverseBytes with the appropriate operand size.
void IntrinsicLocationsBuilderX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitLongReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitShortReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitShortReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
}
238 
// Locations for intrinsics taking one FP value and producing an FP result,
// with no runtime call involved.
static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresFpuRegister());
}
245 
// Math.sqrt maps directly onto sqrtsd; plain FP-to-FP locations suffice.
void IntrinsicLocationsBuilderX86_64::VisitMathSqrt(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}
249 
250 void IntrinsicCodeGeneratorX86_64::VisitMathSqrt(HInvoke* invoke) {
251   LocationSummary* locations = invoke->GetLocations();
252   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
253   XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
254 
255   GetAssembler()->sqrtsd(out, in);
256 }
257 
// Falls back to the out-of-line implementation of an intrinsic: moves the
// arguments into calling-convention locations, emits the static/direct call
// (method pointer in RDI), and copies any result back to the expected output.
static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) {
  MoveArguments(invoke, codegen);

  DCHECK(invoke->IsInvokeStaticOrDirect());
  codegen->GenerateStaticOrDirectCall(
      invoke->AsInvokeStaticOrDirect(), Location::RegisterLocation(RDI));

  // Copy the result back to the expected output.
  Location out = invoke->GetLocations()->Out();
  if (out.IsValid()) {
    DCHECK(out.IsRegister());
    codegen->MoveFromReturnRegister(out, invoke->GetType());
  }
}
272 
// FP-to-FP locations for intrinsics that need SSE4.1 (roundsd): inline
// locations when the feature is available, otherwise call-out locations.
static void CreateSSE41FPToFPLocations(ArenaAllocator* allocator,
                                       HInvoke* invoke,
                                       CodeGeneratorX86_64* codegen) {
  // Do we have instruction support?
  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
    CreateFPToFPLocations(allocator, invoke);
    return;
  }

  // We have to fall back to a call to the intrinsic.
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetOut(Location::FpuRegisterLocation(XMM0));
  // Needs to be RDI for the invoke.
  locations->AddTemp(Location::RegisterLocation(RDI));
}
291 
292 static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86_64* codegen,
293                                    HInvoke* invoke,
294                                    X86_64Assembler* assembler,
295                                    int round_mode) {
296   LocationSummary* locations = invoke->GetLocations();
297   if (locations->WillCall()) {
298     InvokeOutOfLineIntrinsic(codegen, invoke);
299   } else {
300     XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
301     XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
302     __ roundsd(out, in, Immediate(round_mode));
303   }
304 }
305 
// Math.ceil/floor/rint via roundsd. The immediates are the SSE4.1 rounding
// modes: 2 = round up (ceil), 1 = round down (floor), 0 = round to nearest
// even (rint) — see the ROUNDSD encoding in the Intel SDM.
void IntrinsicLocationsBuilderX86_64::VisitMathCeil(HInvoke* invoke) {
  CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCeil(HInvoke* invoke) {
  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
}

void IntrinsicLocationsBuilderX86_64::VisitMathFloor(HInvoke* invoke) {
  CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathFloor(HInvoke* invoke) {
  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
}

void IntrinsicLocationsBuilderX86_64::VisitMathRint(HInvoke* invoke) {
  CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathRint(HInvoke* invoke) {
  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
}
329 
// FP-to-integer locations for Math.round: inline (with two FP temps for the
// rounding sequence) when SSE4.1 is available, otherwise call-out locations.
static void CreateSSE41FPToIntLocations(ArenaAllocator* allocator,
                                        HInvoke* invoke,
                                        CodeGeneratorX86_64* codegen) {
  // Do we have instruction support?
  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
    LocationSummary* locations =
        new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    locations->SetInAt(0, Location::RequiresFpuRegister());
    locations->SetOut(Location::RequiresRegister());
    locations->AddTemp(Location::RequiresFpuRegister());
    locations->AddTemp(Location::RequiresFpuRegister());
    return;
  }

  // We have to fall back to a call to the intrinsic.
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetOut(Location::RegisterLocation(RAX));
  // Needs to be RDI for the invoke.
  locations->AddTemp(Location::RegisterLocation(RDI));
}
353 
void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) {
  CreateSSE41FPToIntLocations(allocator_, invoke, codegen_);
}

// Math.round(float): floor, add 1 when the fraction is >= 0.5, then convert
// to int with Java semantics (NaN -> 0, saturate at Integer.MAX_VALUE).
void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  if (locations->WillCall()) {
    // No SSE4.1: the locations were set up for an out-of-line call instead.
    InvokeOutOfLineIntrinsic(codegen_, invoke);
    return;
  }

  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
  XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
  XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
  NearLabel skip_incr, done;
  X86_64Assembler* assembler = GetAssembler();

  // Since no direct x86 rounding instruction matches the required semantics,
  // this intrinsic is implemented as follows:
  //  result = floor(in);
  //  if (in - result >= 0.5f)
  //    result = result + 1.0f;
  __ movss(t2, in);
  __ roundss(t1, in, Immediate(1));  // Rounding mode 1: round down (floor).
  __ subss(t2, t1);
  __ comiss(t2, codegen_->LiteralFloatAddress(0.5f));
  __ j(kBelow, &skip_incr);
  __ addss(t1, codegen_->LiteralFloatAddress(1.0f));
  __ Bind(&skip_incr);

  // Final conversion to an integer. Unfortunately this also does not have a
  // direct x86 instruction, since NaN should map to 0 and large positive
  // values need to be clipped to the extreme value.
  codegen_->Load32BitValue(out, kPrimIntMax);
  __ cvtsi2ss(t2, out);
  __ comiss(t1, t2);
  __ j(kAboveEqual, &done);  // clipped to max (already in out), does not jump on unordered
  __ movl(out, Immediate(0));  // does not change flags
  __ j(kUnordered, &done);  // NaN mapped to 0 (just moved in out)
  __ cvttss2si(out, t1);
  __ Bind(&done);
}
397 
// Math.round(double) uses the same SSE4.1/fallback location split as the
// float variant.
void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) {
  CreateSSE41FPToIntLocations(allocator_, invoke, codegen_);
}
401 
402 void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) {
403   LocationSummary* locations = invoke->GetLocations();
404   if (locations->WillCall()) {
405     InvokeOutOfLineIntrinsic(codegen_, invoke);
406     return;
407   }
408 
409   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
410   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
411   XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
412   XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
413   NearLabel skip_incr, done;
414   X86_64Assembler* assembler = GetAssembler();
415 
416   // Since no direct x86 rounding instruction matches the required semantics,
417   // this intrinsic is implemented as follows:
418   //  result = floor(in);
419   //  if (in - result >= 0.5)
420   //    result = result + 1.0f;
421   __ movsd(t2, in);
422   __ roundsd(t1, in, Immediate(1));
423   __ subsd(t2, t1);
424   __ comisd(t2, codegen_->LiteralDoubleAddress(0.5));
425   __ j(kBelow, &skip_incr);
426   __ addsd(t1, codegen_->LiteralDoubleAddress(1.0f));
427   __ Bind(&skip_incr);
428 
429   // Final conversion to an integer. Unfortunately this also does not have a
430   // direct x86 instruction, since NaN should map to 0 and large positive
431   // values need to be clipped to the extreme value.
432   codegen_->Load64BitValue(out, kPrimLongMax);
433   __ cvtsi2sd(t2, out, /* is64bit= */ true);
434   __ comisd(t1, t2);
435   __ j(kAboveEqual, &done);  // clipped to max (already in out), does not jump on unordered
436   __ movl(out, Immediate(0));  // does not change flags, implicit zero extension to 64-bit
437   __ j(kUnordered, &done);  // NaN mapped to 0 (just moved in out)
438   __ cvttsd2si(out, t1, /* is64bit= */ true);
439   __ Bind(&done);
440 }
441 
// Locations for unary FP intrinsics implemented entirely by a runtime call
// (sin, cos, log, ...): argument in the first FP calling-convention register,
// result in XMM0.
static void CreateFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetOut(Location::FpuRegisterLocation(XMM0));

  // We have to ensure that the native code doesn't clobber the XMM registers which are
  // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
  // saved in the prologue and properly restored.
  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
  }
}
456 
// Emits the runtime call for an FP intrinsic whose locations were set up as a
// main-only call (see CreateFPToFPCallLocations / CreateFPFPToFPCallLocations).
static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86_64* codegen,
                          QuickEntrypointEnum entry) {
  LocationSummary* locations = invoke->GetLocations();
  DCHECK(locations->WillCall());
  DCHECK(invoke->IsInvokeStaticOrDirect());

  codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc());
}
465 
// Unary java.lang.Math intrinsics with no direct x86-64 instruction: each sets
// up FP-to-FP runtime-call locations and invokes the matching quick entrypoint.
void IntrinsicLocationsBuilderX86_64::VisitMathCos(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCos(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickCos);
}

void IntrinsicLocationsBuilderX86_64::VisitMathSin(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathSin(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickSin);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAcos(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAcos(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAcos);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAsin(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAsin(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAsin);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAtan(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAtan(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAtan);
}

void IntrinsicLocationsBuilderX86_64::VisitMathCbrt(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCbrt(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickCbrt);
}

void IntrinsicLocationsBuilderX86_64::VisitMathCosh(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCosh(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickCosh);
}

void IntrinsicLocationsBuilderX86_64::VisitMathExp(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathExp(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickExp);
}

void IntrinsicLocationsBuilderX86_64::VisitMathExpm1(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathExpm1(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickExpm1);
}

void IntrinsicLocationsBuilderX86_64::VisitMathLog(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathLog(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickLog);
}

void IntrinsicLocationsBuilderX86_64::VisitMathLog10(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathLog10(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickLog10);
}

void IntrinsicLocationsBuilderX86_64::VisitMathSinh(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathSinh(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickSinh);
}

void IntrinsicLocationsBuilderX86_64::VisitMathTan(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathTan(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickTan);
}

void IntrinsicLocationsBuilderX86_64::VisitMathTanh(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathTanh(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickTanh);
}
577 
// Locations for binary FP intrinsics implemented by a runtime call (atan2,
// pow, hypot, nextAfter): both arguments in FP calling-convention registers,
// result in XMM0.
static void CreateFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
  locations->SetOut(Location::FpuRegisterLocation(XMM0));

  // We have to ensure that the native code doesn't clobber the XMM registers which are
  // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
  // saved in the prologue and properly restored.
  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
  }
}
593 
// Binary java.lang.Math intrinsics, all implemented as runtime calls.
void IntrinsicLocationsBuilderX86_64::VisitMathAtan2(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAtan2(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAtan2);
}

void IntrinsicLocationsBuilderX86_64::VisitMathPow(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathPow(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickPow);
}

void IntrinsicLocationsBuilderX86_64::VisitMathHypot(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathHypot(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickHypot);
}

void IntrinsicLocationsBuilderX86_64::VisitMathNextAfter(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathNextAfter(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
}
625 
void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
  // Check to see if we have known failures that will cause us to have to bail out
  // to the runtime, and just generate the runtime call directly.
  HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
  HIntConstant* dest_pos = invoke->InputAt(3)->AsIntConstant();

  // The positions must be non-negative.
  if ((src_pos != nullptr && src_pos->GetValue() < 0) ||
      (dest_pos != nullptr && dest_pos->GetValue() < 0)) {
    // We will have to fail anyways.
    return;
  }

  // The length must be >= 0 (a constant negative length always fails, so the
  // plain runtime call handles it; zero-length copies stay intrinsified).
  HIntConstant* length = invoke->InputAt(4)->AsIntConstant();
  if (length != nullptr) {
    int32_t len = length->GetValue();
    if (len < 0) {
      // Just call as normal.
      return;
    }
  }

  LocationSummary* locations =
      new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
  // arraycopy(Object src, int src_pos, Object dest, int dest_pos, int length).
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetInAt(3, Location::RegisterOrConstant(invoke->InputAt(3)));
  locations->SetInAt(4, Location::RegisterOrConstant(invoke->InputAt(4)));

  // And we need some temporaries.  We will use REP MOVSW, so we need fixed registers.
  locations->AddTemp(Location::RegisterLocation(RSI));
  locations->AddTemp(Location::RegisterLocation(RDI));
  locations->AddTemp(Location::RegisterLocation(RCX));
}
663 
// Emits range checks for one side (source or destination) of an array copy:
// branches to `slow_path` unless 0 <= pos <= length(input) and
// (length(input) - pos) >= length. When `length_is_input_length` is true, the
// copy length is known to equal length(input), so only pos == 0 can succeed.
static void CheckPosition(X86_64Assembler* assembler,
                          Location pos,
                          CpuRegister input,
                          Location length,
                          SlowPathCode* slow_path,
                          CpuRegister temp,
                          bool length_is_input_length = false) {
  // Where is the length in the Array?
  const uint32_t length_offset = mirror::Array::LengthOffset().Uint32Value();

  if (pos.IsConstant()) {
    int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue();
    if (pos_const == 0) {
      if (!length_is_input_length) {
        // Check that length(input) >= length.
        if (length.IsConstant()) {
          __ cmpl(Address(input, length_offset),
                  Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
        } else {
          __ cmpl(Address(input, length_offset), length.AsRegister<CpuRegister>());
        }
        __ j(kLess, slow_path->GetEntryLabel());
      }
    } else {
      // Check that length(input) >= pos.
      __ movl(temp, Address(input, length_offset));
      __ subl(temp, Immediate(pos_const));
      __ j(kLess, slow_path->GetEntryLabel());

      // Check that (length(input) - pos) >= length.
      if (length.IsConstant()) {
        __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
      } else {
        __ cmpl(temp, length.AsRegister<CpuRegister>());
      }
      __ j(kLess, slow_path->GetEntryLabel());
    }
  } else if (length_is_input_length) {
    // The only way the copy can succeed is if pos is zero.
    CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
    __ testl(pos_reg, pos_reg);
    __ j(kNotEqual, slow_path->GetEntryLabel());
  } else {
    // Check that pos >= 0.
    CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
    __ testl(pos_reg, pos_reg);
    __ j(kLess, slow_path->GetEntryLabel());

    // Check that pos <= length(input).
    __ cmpl(Address(input, length_offset), pos_reg);
    __ j(kLess, slow_path->GetEntryLabel());

    // Check that (length(input) - pos) >= length.
    __ movl(temp, Address(input, length_offset));
    __ subl(temp, pos_reg);
    if (length.IsConstant()) {
      __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
    } else {
      __ cmpl(temp, length.AsRegister<CpuRegister>());
    }
    __ j(kLess, slow_path->GetEntryLabel());
  }
}
727 
// System.arraycopy for char[]: validates the operands inline (bailing to the
// slow path on any failed check) and performs the copy with REP MOVSW.
void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
  Location src_pos = locations->InAt(1);
  CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
  Location dest_pos = locations->InAt(3);
  Location length = locations->InAt(4);

  // Temporaries that we need for MOVSW.
  CpuRegister src_base = locations->GetTemp(0).AsRegister<CpuRegister>();
  DCHECK_EQ(src_base.AsRegister(), RSI);
  CpuRegister dest_base = locations->GetTemp(1).AsRegister<CpuRegister>();
  DCHECK_EQ(dest_base.AsRegister(), RDI);
  CpuRegister count = locations->GetTemp(2).AsRegister<CpuRegister>();
  DCHECK_EQ(count.AsRegister(), RCX);

  SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);

  // Bail out if the source and destination are the same.
  __ cmpl(src, dest);
  __ j(kEqual, slow_path->GetEntryLabel());

  // Bail out if the source is null.
  __ testl(src, src);
  __ j(kEqual, slow_path->GetEntryLabel());

  // Bail out if the destination is null.
  __ testl(dest, dest);
  __ j(kEqual, slow_path->GetEntryLabel());

  // If the length is negative, bail out.
  // We have already checked in the LocationsBuilder for the constant case.
  if (!length.IsConstant()) {
    __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
    __ j(kLess, slow_path->GetEntryLabel());
  }

  // Validity checks: source. Use src_base as a temporary register.
  CheckPosition(assembler, src_pos, src, length, slow_path, src_base);

  // Validity checks: dest. Use src_base as a temporary register.
  CheckPosition(assembler, dest_pos, dest, length, slow_path, src_base);

  // We need the count in RCX.
  if (length.IsConstant()) {
    __ movl(count, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
  } else {
    __ movl(count, length.AsRegister<CpuRegister>());
  }

  // Okay, everything checks out.  Finally time to do the copy.
  // Check assumption that sizeof(Char) is 2 (used in scaling below).
  const size_t char_size = DataType::Size(DataType::Type::kUint16);
  DCHECK_EQ(char_size, 2u);

  const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();

  // Compute the start addresses (base + data offset + scaled position) into
  // the fixed RSI/RDI registers that REP MOVSW reads from/writes to.
  if (src_pos.IsConstant()) {
    int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue();
    __ leal(src_base, Address(src, char_size * src_pos_const + data_offset));
  } else {
    __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(),
                              ScaleFactor::TIMES_2, data_offset));
  }
  if (dest_pos.IsConstant()) {
    int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue();
    __ leal(dest_base, Address(dest, char_size * dest_pos_const + data_offset));
  } else {
    __ leal(dest_base, Address(dest, dest_pos.AsRegister<CpuRegister>(),
                               ScaleFactor::TIMES_2, data_offset));
  }

  // Do the move.
  __ rep_movsw();

  __ Bind(slow_path->GetExitLabel());
}
808 
809 
810 void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
811   // The only read barrier implementation supporting the
812   // SystemArrayCopy intrinsic is the Baker-style read barriers.
813   if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
814     return;
815   }
816 
817   CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke);
818 }
819 
820 // Compute base source address, base destination address, and end
821 // source address for the System.arraycopy intrinsic in `src_base`,
822 // `dst_base` and `src_end` respectively.
823 static void GenSystemArrayCopyAddresses(X86_64Assembler* assembler,
824                                         DataType::Type type,
825                                         const CpuRegister& src,
826                                         const Location& src_pos,
827                                         const CpuRegister& dst,
828                                         const Location& dst_pos,
829                                         const Location& copy_length,
830                                         const CpuRegister& src_base,
831                                         const CpuRegister& dst_base,
832                                         const CpuRegister& src_end) {
833   // This routine is only used by the SystemArrayCopy intrinsic.
834   DCHECK_EQ(type, DataType::Type::kReference);
835   const int32_t element_size = DataType::Size(type);
836   const ScaleFactor scale_factor = static_cast<ScaleFactor>(DataType::SizeShift(type));
837   const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value();
838 
839   if (src_pos.IsConstant()) {
840     int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
841     __ leal(src_base, Address(src, element_size * constant + data_offset));
842   } else {
843     __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(), scale_factor, data_offset));
844   }
845 
846   if (dst_pos.IsConstant()) {
847     int32_t constant = dst_pos.GetConstant()->AsIntConstant()->GetValue();
848     __ leal(dst_base, Address(dst, element_size * constant + data_offset));
849   } else {
850     __ leal(dst_base, Address(dst, dst_pos.AsRegister<CpuRegister>(), scale_factor, data_offset));
851   }
852 
853   if (copy_length.IsConstant()) {
854     int32_t constant = copy_length.GetConstant()->AsIntConstant()->GetValue();
855     __ leal(src_end, Address(src_base, element_size * constant));
856   } else {
857     __ leal(src_end, Address(src_base, copy_length.AsRegister<CpuRegister>(), scale_factor, 0));
858   }
859 }
860 
// Generates the System.arraycopy intrinsic for reference arrays. Position,
// length and element-type checks are emitted inline and branch to
// `intrinsic_slow_path` when they fail; the copy itself is a simple
// word-by-word loop, with a Baker read barrier gray-check variant when read
// barriers are enabled.
void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
  // The only read barrier implementation supporting the
  // SystemArrayCopy intrinsic is the Baker-style read barriers.
  DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);

  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
  uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
  uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
  uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();

  CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
  Location src_pos = locations->InAt(1);
  CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
  Location dest_pos = locations->InAt(3);
  Location length = locations->InAt(4);
  Location temp1_loc = locations->GetTemp(0);
  CpuRegister temp1 = temp1_loc.AsRegister<CpuRegister>();
  Location temp2_loc = locations->GetTemp(1);
  CpuRegister temp2 = temp2_loc.AsRegister<CpuRegister>();
  Location temp3_loc = locations->GetTemp(2);
  CpuRegister temp3 = temp3_loc.AsRegister<CpuRegister>();
  Location TMP_loc = Location::RegisterLocation(TMP);

  SlowPathCode* intrinsic_slow_path =
      new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(intrinsic_slow_path);

  NearLabel conditions_on_positions_validated;
  SystemArrayCopyOptimizations optimizations(invoke);

  // If source and destination are the same, we go to slow path if we need to do
  // forward copying.
  if (src_pos.IsConstant()) {
    int32_t src_pos_constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
    if (dest_pos.IsConstant()) {
      int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
      if (optimizations.GetDestinationIsSource()) {
        // Checked when building locations.
        DCHECK_GE(src_pos_constant, dest_pos_constant);
      } else if (src_pos_constant < dest_pos_constant) {
        __ cmpl(src, dest);
        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
      }
    } else {
      if (!optimizations.GetDestinationIsSource()) {
        __ cmpl(src, dest);
        __ j(kNotEqual, &conditions_on_positions_validated);
      }
      __ cmpl(dest_pos.AsRegister<CpuRegister>(), Immediate(src_pos_constant));
      __ j(kGreater, intrinsic_slow_path->GetEntryLabel());
    }
  } else {
    if (!optimizations.GetDestinationIsSource()) {
      __ cmpl(src, dest);
      __ j(kNotEqual, &conditions_on_positions_validated);
    }
    if (dest_pos.IsConstant()) {
      int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
      __ cmpl(src_pos.AsRegister<CpuRegister>(), Immediate(dest_pos_constant));
      __ j(kLess, intrinsic_slow_path->GetEntryLabel());
    } else {
      __ cmpl(src_pos.AsRegister<CpuRegister>(), dest_pos.AsRegister<CpuRegister>());
      __ j(kLess, intrinsic_slow_path->GetEntryLabel());
    }
  }

  __ Bind(&conditions_on_positions_validated);

  if (!optimizations.GetSourceIsNotNull()) {
    // Bail out if the source is null.
    __ testl(src, src);
    __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
  }

  if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
    // Bail out if the destination is null.
    __ testl(dest, dest);
    __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
  }

  // If the length is negative, bail out.
  // We have already checked in the LocationsBuilder for the constant case.
  if (!length.IsConstant() &&
      !optimizations.GetCountIsSourceLength() &&
      !optimizations.GetCountIsDestinationLength()) {
    __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
    __ j(kLess, intrinsic_slow_path->GetEntryLabel());
  }

  // Validity checks: source.
  CheckPosition(assembler,
                src_pos,
                src,
                length,
                intrinsic_slow_path,
                temp1,
                optimizations.GetCountIsSourceLength());

  // Validity checks: dest.
  CheckPosition(assembler,
                dest_pos,
                dest,
                length,
                intrinsic_slow_path,
                temp1,
                optimizations.GetCountIsDestinationLength());

  if (!optimizations.GetDoesNotNeedTypeCheck()) {
    // Check whether all elements of the source array are assignable to the component
    // type of the destination array. We do two checks: the classes are the same,
    // or the destination is Object[]. If none of these checks succeed, we go to the
    // slow path.

    bool did_unpoison = false;
    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
      // /* HeapReference<Class> */ temp1 = dest->klass_
      codegen_->GenerateFieldLoadWithBakerReadBarrier(
          invoke, temp1_loc, dest, class_offset, /* needs_null_check= */ false);
      // Register `temp1` is not trashed by the read barrier emitted
      // by GenerateFieldLoadWithBakerReadBarrier below, as that
      // method produces a call to a ReadBarrierMarkRegX entry point,
      // which saves all potentially live registers, including
      // temporaries such a `temp1`.
      // /* HeapReference<Class> */ temp2 = src->klass_
      codegen_->GenerateFieldLoadWithBakerReadBarrier(
          invoke, temp2_loc, src, class_offset, /* needs_null_check= */ false);
      // If heap poisoning is enabled, `temp1` and `temp2` have been
      // unpoisoned by the previous calls to
      // GenerateFieldLoadWithBakerReadBarrier.
    } else {
      // /* HeapReference<Class> */ temp1 = dest->klass_
      __ movl(temp1, Address(dest, class_offset));
      // /* HeapReference<Class> */ temp2 = src->klass_
      __ movl(temp2, Address(src, class_offset));
      if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
          !optimizations.GetSourceIsNonPrimitiveArray()) {
        // One or two of the references need to be unpoisoned. Unpoison them
        // both to make the identity check valid.
        __ MaybeUnpoisonHeapReference(temp1);
        __ MaybeUnpoisonHeapReference(temp2);
        did_unpoison = true;
      }
    }

    if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
      // Bail out if the destination is not a non primitive array.
      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
        // /* HeapReference<Class> */ TMP = temp1->component_type_
        codegen_->GenerateFieldLoadWithBakerReadBarrier(
            invoke, TMP_loc, temp1, component_offset, /* needs_null_check= */ false);
        __ testl(CpuRegister(TMP), CpuRegister(TMP));
        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
        // If heap poisoning is enabled, `TMP` has been unpoisoned by
        // the previous call to GenerateFieldLoadWithBakerReadBarrier.
      } else {
        // /* HeapReference<Class> */ TMP = temp1->component_type_
        __ movl(CpuRegister(TMP), Address(temp1, component_offset));
        __ testl(CpuRegister(TMP), CpuRegister(TMP));
        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
        __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
      }
      // A null component type means the class is not an array; a primitive
      // component type means it is a primitive array. Either way, bail out.
      __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
    }

    if (!optimizations.GetSourceIsNonPrimitiveArray()) {
      // Bail out if the source is not a non primitive array.
      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
        // For the same reason given earlier, `temp1` is not trashed by the
        // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
        // /* HeapReference<Class> */ TMP = temp2->component_type_
        codegen_->GenerateFieldLoadWithBakerReadBarrier(
            invoke, TMP_loc, temp2, component_offset, /* needs_null_check= */ false);
        __ testl(CpuRegister(TMP), CpuRegister(TMP));
        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
        // If heap poisoning is enabled, `TMP` has been unpoisoned by
        // the previous call to GenerateFieldLoadWithBakerReadBarrier.
      } else {
        // /* HeapReference<Class> */ TMP = temp2->component_type_
        __ movl(CpuRegister(TMP), Address(temp2, component_offset));
        __ testl(CpuRegister(TMP), CpuRegister(TMP));
        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
        __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
      }
      __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
    }

    // Identity check: same class for source and destination elements?
    __ cmpl(temp1, temp2);

    if (optimizations.GetDestinationIsTypedObjectArray()) {
      NearLabel do_copy;
      __ j(kEqual, &do_copy);
      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
        // /* HeapReference<Class> */ temp1 = temp1->component_type_
        codegen_->GenerateFieldLoadWithBakerReadBarrier(
            invoke, temp1_loc, temp1, component_offset, /* needs_null_check= */ false);
        // We do not need to emit a read barrier for the following
        // heap reference load, as `temp1` is only used in a
        // comparison with null below, and this reference is not
        // kept afterwards.
        __ cmpl(Address(temp1, super_offset), Immediate(0));
      } else {
        if (!did_unpoison) {
          __ MaybeUnpoisonHeapReference(temp1);
        }
        // /* HeapReference<Class> */ temp1 = temp1->component_type_
        __ movl(temp1, Address(temp1, component_offset));
        __ MaybeUnpoisonHeapReference(temp1);
        // No need to unpoison the following heap reference load, as
        // we're comparing against null.
        __ cmpl(Address(temp1, super_offset), Immediate(0));
      }
      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
      __ Bind(&do_copy);
    } else {
      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
    }
  } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
    DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
    // Bail out if the source is not a non primitive array.
    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
      // /* HeapReference<Class> */ temp1 = src->klass_
      codegen_->GenerateFieldLoadWithBakerReadBarrier(
          invoke, temp1_loc, src, class_offset, /* needs_null_check= */ false);
      // /* HeapReference<Class> */ TMP = temp1->component_type_
      codegen_->GenerateFieldLoadWithBakerReadBarrier(
          invoke, TMP_loc, temp1, component_offset, /* needs_null_check= */ false);
      __ testl(CpuRegister(TMP), CpuRegister(TMP));
      __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
    } else {
      // /* HeapReference<Class> */ temp1 = src->klass_
      __ movl(temp1, Address(src, class_offset));
      __ MaybeUnpoisonHeapReference(temp1);
      // /* HeapReference<Class> */ TMP = temp1->component_type_
      __ movl(CpuRegister(TMP), Address(temp1, component_offset));
      // No need to unpoison `TMP` now, as we're comparing against null.
      __ testl(CpuRegister(TMP), CpuRegister(TMP));
      __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
      __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
    }
    __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
    __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
  }

  const DataType::Type type = DataType::Type::kReference;
  const int32_t element_size = DataType::Size(type);

  // Compute base source address, base destination address, and end
  // source address in `temp1`, `temp2` and `temp3` respectively.
  GenSystemArrayCopyAddresses(
      GetAssembler(), type, src, src_pos, dest, dest_pos, length, temp1, temp2, temp3);

  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
    // SystemArrayCopy implementation for Baker read barriers (see
    // also CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier):
    //
    //   if (src_ptr != end_ptr) {
    //     uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState();
    //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
    //     bool is_gray = (rb_state == ReadBarrier::GrayState());
    //     if (is_gray) {
    //       // Slow-path copy.
    //       do {
    //         *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
    //       } while (src_ptr != end_ptr)
    //     } else {
    //       // Fast-path copy.
    //       do {
    //         *dest_ptr++ = *src_ptr++;
    //       } while (src_ptr != end_ptr)
    //     }
    //   }

    NearLabel loop, done;

    // Don't enter copy loop if `length == 0`.
    __ cmpl(temp1, temp3);
    __ j(kEqual, &done);

    // Given the numeric representation, it's enough to check the low bit of the rb_state.
    static_assert(ReadBarrier::NonGrayState() == 0, "Expecting non-gray to have value 0");
    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
    constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
    constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
    constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);

    // if (rb_state == ReadBarrier::GrayState())
    //   goto slow_path;
    // At this point, just do the "if" and make sure that flags are preserved until the branch.
    __ testb(Address(src, monitor_offset + gray_byte_position), Immediate(test_value));

    // Load fence to prevent load-load reordering.
    // Note that this is a no-op, thanks to the x86-64 memory model.
    codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);

    // Slow path used to copy array when `src` is gray.
    SlowPathCode* read_barrier_slow_path =
        new (codegen_->GetScopedAllocator()) ReadBarrierSystemArrayCopySlowPathX86_64(invoke);
    codegen_->AddSlowPath(read_barrier_slow_path);

    // We have done the "if" of the gray bit check above, now branch based on the flags.
    __ j(kNotZero, read_barrier_slow_path->GetEntryLabel());

    // Fast-path copy.
    // Iterate over the arrays and do a raw copy of the objects. We don't need to
    // poison/unpoison.
    __ Bind(&loop);
    __ movl(CpuRegister(TMP), Address(temp1, 0));
    __ movl(Address(temp2, 0), CpuRegister(TMP));
    __ addl(temp1, Immediate(element_size));
    __ addl(temp2, Immediate(element_size));
    __ cmpl(temp1, temp3);
    __ j(kNotEqual, &loop);

    __ Bind(read_barrier_slow_path->GetExitLabel());
    __ Bind(&done);
  } else {
    // Non read barrier code.

    // Iterate over the arrays and do a raw copy of the objects. We don't need to
    // poison/unpoison.
    NearLabel loop, done;
    __ cmpl(temp1, temp3);
    __ j(kEqual, &done);
    __ Bind(&loop);
    __ movl(CpuRegister(TMP), Address(temp1, 0));
    __ movl(Address(temp2, 0), CpuRegister(TMP));
    __ addl(temp1, Immediate(element_size));
    __ addl(temp2, Immediate(element_size));
    __ cmpl(temp1, temp3);
    __ j(kNotEqual, &loop);
    __ Bind(&done);
  }

  // We only need one card marking on the destination array.
  codegen_->MarkGCCard(temp1, temp2, dest, CpuRegister(kNoRegister), /* value_can_be_null= */ false);

  __ Bind(intrinsic_slow_path->GetExitLabel());
}
1205 
1206 void IntrinsicLocationsBuilderX86_64::VisitStringCompareTo(HInvoke* invoke) {
1207   LocationSummary* locations = new (allocator_) LocationSummary(
1208       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
1209   InvokeRuntimeCallingConvention calling_convention;
1210   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
1211   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
1212   locations->SetOut(Location::RegisterLocation(RAX));
1213 }
1214 
1215 void IntrinsicCodeGeneratorX86_64::VisitStringCompareTo(HInvoke* invoke) {
1216   X86_64Assembler* assembler = GetAssembler();
1217   LocationSummary* locations = invoke->GetLocations();
1218 
1219   // Note that the null check must have been done earlier.
1220   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
1221 
1222   CpuRegister argument = locations->InAt(1).AsRegister<CpuRegister>();
1223   __ testl(argument, argument);
1224   SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
1225   codegen_->AddSlowPath(slow_path);
1226   __ j(kEqual, slow_path->GetEntryLabel());
1227 
1228   codegen_->InvokeRuntime(kQuickStringCompareTo, invoke, invoke->GetDexPc(), slow_path);
1229   __ Bind(slow_path->GetExitLabel());
1230 }
1231 
1232 void IntrinsicLocationsBuilderX86_64::VisitStringEquals(HInvoke* invoke) {
1233   LocationSummary* locations =
1234       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
1235   locations->SetInAt(0, Location::RequiresRegister());
1236   locations->SetInAt(1, Location::RequiresRegister());
1237 
1238   // Request temporary registers, RCX and RDI needed for repe_cmpsq instruction.
1239   locations->AddTemp(Location::RegisterLocation(RCX));
1240   locations->AddTemp(Location::RegisterLocation(RDI));
1241 
1242   // Set output, RSI needed for repe_cmpsq instruction anyways.
1243   locations->SetOut(Location::RegisterLocation(RSI), Location::kOutputOverlap);
1244 }
1245 
// Generates String.equals() inline: after null/class/reference/length checks,
// the character data is compared 8 bytes at a time with REPE CMPSQ. The
// boolean result (0 or 1) is produced in RSI.
void IntrinsicCodeGeneratorX86_64::VisitStringEquals(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister str = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister arg = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister rcx = locations->GetTemp(0).AsRegister<CpuRegister>();
  CpuRegister rdi = locations->GetTemp(1).AsRegister<CpuRegister>();
  CpuRegister rsi = locations->Out().AsRegister<CpuRegister>();

  NearLabel end, return_true, return_false;

  // Get offsets of count, value, and class fields within a string object.
  const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
  const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
  const uint32_t class_offset = mirror::Object::ClassOffset().Uint32Value();

  // Note that the null check must have been done earlier.
  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));

  StringEqualsOptimizations optimizations(invoke);
  if (!optimizations.GetArgumentNotNull()) {
    // Check if input is null, return false if it is.
    __ testl(arg, arg);
    __ j(kEqual, &return_false);
  }

  if (!optimizations.GetArgumentIsString()) {
    // Instanceof check for the argument by comparing class fields.
    // All string objects must have the same type since String cannot be subclassed.
    // Receiver must be a string object, so its class field is equal to all strings' class fields.
    // If the argument is a string object, its class field must be equal to receiver's class field.
    //
    // As the String class is expected to be non-movable, we can read the class
    // field from String.equals' arguments without read barriers.
    AssertNonMovableStringClass();
    // Also, because we use the loaded class references only to compare them, we
    // don't need to unpoison them.
    // /* HeapReference<Class> */ rcx = str->klass_
    __ movl(rcx, Address(str, class_offset));
    // if (rcx != /* HeapReference<Class> */ arg->klass_) return false
    __ cmpl(rcx, Address(arg, class_offset));
    __ j(kNotEqual, &return_false);
  }

  // Reference equality check, return true if same reference.
  __ cmpl(str, arg);
  __ j(kEqual, &return_true);

  // Load length and compression flag of receiver string.
  __ movl(rcx, Address(str, count_offset));
  // Check if lengths and compression flags are equal, return false if they're not.
  // Two identical strings will always have same compression style since
  // compression style is decided on alloc.
  __ cmpl(rcx, Address(arg, count_offset));
  __ j(kNotEqual, &return_false);
  // Return true if both strings are empty. Even with string compression `count == 0` means empty.
  static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
                "Expecting 0=compressed, 1=uncompressed");
  __ jrcxz(&return_true);

  if (mirror::kUseStringCompression) {
    NearLabel string_uncompressed;
    // Extract length and differentiate between both compressed or both uncompressed.
    // Different compression style is cut above.
    // The compression flag (bit 0) is shifted into the carry flag here.
    __ shrl(rcx, Immediate(1));
    __ j(kCarrySet, &string_uncompressed);
    // Divide string length by 2, rounding up, and continue as if uncompressed.
    // Merge clearing the compression flag with +1 for rounding.
    __ addl(rcx, Immediate(1));
    __ shrl(rcx, Immediate(1));
    __ Bind(&string_uncompressed);
  }
  // Load starting addresses of string values into RSI/RDI as required for repe_cmpsq instruction.
  __ leal(rsi, Address(str, value_offset));
  __ leal(rdi, Address(arg, value_offset));

  // Divide string length by 4 and adjust for lengths not divisible by 4.
  __ addl(rcx, Immediate(3));
  __ shrl(rcx, Immediate(2));

  // Assertions that must hold in order to compare strings 4 characters (uncompressed)
  // or 8 characters (compressed) at a time.
  DCHECK_ALIGNED(value_offset, 8);
  static_assert(IsAligned<8>(kObjectAlignment), "String is not zero padded");

  // Loop to compare strings four characters at a time starting at the beginning of the string.
  __ repe_cmpsq();
  // If strings are not equal, zero flag will be cleared.
  __ j(kNotEqual, &return_false);

  // Return true and exit the function.
  // If loop does not result in returning false, we return true.
  __ Bind(&return_true);
  __ movl(rsi, Immediate(1));
  __ jmp(&end);

  // Return false and exit the function.
  __ Bind(&return_false);
  __ xorl(rsi, rsi);
  __ Bind(&end);
}
1348 
1349 static void CreateStringIndexOfLocations(HInvoke* invoke,
1350                                          ArenaAllocator* allocator,
1351                                          bool start_at_zero) {
1352   LocationSummary* locations = new (allocator) LocationSummary(invoke,
1353                                                                LocationSummary::kCallOnSlowPath,
1354                                                                kIntrinsified);
1355   // The data needs to be in RDI for scasw. So request that the string is there, anyways.
1356   locations->SetInAt(0, Location::RegisterLocation(RDI));
1357   // If we look for a constant char, we'll still have to copy it into RAX. So just request the
1358   // allocator to do that, anyways. We can still do the constant check by checking the parameter
1359   // of the instruction explicitly.
1360   // Note: This works as we don't clobber RAX anywhere.
1361   locations->SetInAt(1, Location::RegisterLocation(RAX));
1362   if (!start_at_zero) {
1363     locations->SetInAt(2, Location::RequiresRegister());          // The starting index.
1364   }
1365   // As we clobber RDI during execution anyways, also use it as the output.
1366   locations->SetOut(Location::SameAsFirstInput());
1367 
1368   // repne scasw uses RCX as the counter.
1369   locations->AddTemp(Location::RegisterLocation(RCX));
1370   // Need another temporary to be able to compute the result.
1371   locations->AddTemp(Location::RequiresRegister());
1372 }
1373 
// Common code for String.indexOf(int) (start_at_zero == true) and
// String.indexOf(int, int). Scans the string with REPNE SCASB/SCASW, which is
// why the locations builder pins the string to RDI, the search char to RAX and
// the counter to RCX. Code points above 0xFFFF cannot match a single char and
// are handled by a slow path that calls the full method.
static void GenerateStringIndexOf(HInvoke* invoke,
                                  X86_64Assembler* assembler,
                                  CodeGeneratorX86_64* codegen,
                                  bool start_at_zero) {
  LocationSummary* locations = invoke->GetLocations();

  // Note that the null check must have been done earlier.
  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));

  CpuRegister string_obj = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister search_value = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister counter = locations->GetTemp(0).AsRegister<CpuRegister>();
  CpuRegister string_length = locations->GetTemp(1).AsRegister<CpuRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  // Check our assumptions for registers.
  DCHECK_EQ(string_obj.AsRegister(), RDI);
  DCHECK_EQ(search_value.AsRegister(), RAX);
  DCHECK_EQ(counter.AsRegister(), RCX);
  DCHECK_EQ(out.AsRegister(), RDI);

  // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
  // or directly dispatch for a large constant, or omit slow-path for a small constant or a char.
  SlowPathCode* slow_path = nullptr;
  HInstruction* code_point = invoke->InputAt(1);
  if (code_point->IsIntConstant()) {
    if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) >
    std::numeric_limits<uint16_t>::max()) {
      // Always needs the slow-path. We could directly dispatch to it, but this case should be
      // rare, so for simplicity just put the full slow-path down and branch unconditionally.
      slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
      codegen->AddSlowPath(slow_path);
      __ jmp(slow_path->GetEntryLabel());
      __ Bind(slow_path->GetExitLabel());
      return;
    }
  } else if (code_point->GetType() != DataType::Type::kUint16) {
    // Runtime check: a non-char code point argument may exceed the char range.
    __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
    slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
    codegen->AddSlowPath(slow_path);
    __ j(kAbove, slow_path->GetEntryLabel());
  }

  // From here down, we know that we are looking for a char that fits in
  // 16 bits (uncompressed) or 8 bits (compressed).
  // Location of reference to data array within the String object.
  int32_t value_offset = mirror::String::ValueOffset().Int32Value();
  // Location of count within the String object.
  int32_t count_offset = mirror::String::CountOffset().Int32Value();

  // Load the count field of the string containing the length and compression flag.
  __ movl(string_length, Address(string_obj, count_offset));

  // Do a zero-length check. Even with string compression `count == 0` means empty.
  // TODO: Support jecxz.
  NearLabel not_found_label;
  __ testl(string_length, string_length);
  __ j(kEqual, &not_found_label);

  if (mirror::kUseStringCompression) {
    // Use TMP to keep string_length_flagged.
    __ movl(CpuRegister(TMP), string_length);
    // Mask out first bit used as compression flag.
    __ shrl(string_length, Immediate(1));
  }

  if (start_at_zero) {
    // Number of chars to scan is the same as the string length.
    __ movl(counter, string_length);
    // Move to the start of the string.
    __ addq(string_obj, Immediate(value_offset));
  } else {
    CpuRegister start_index = locations->InAt(2).AsRegister<CpuRegister>();

    // Do a start_index check.
    __ cmpl(start_index, string_length);
    __ j(kGreaterEqual, &not_found_label);

    // Ensure we have a start index >= 0;
    __ xorl(counter, counter);
    __ cmpl(start_index, Immediate(0));
    __ cmov(kGreater, counter, start_index, /* is64bit= */ false);  // 32-bit copy is enough.

    if (mirror::kUseStringCompression) {
      NearLabel modify_counter, offset_uncompressed_label;
      // Test the compression flag kept in TMP; compressed strings use 1 byte per char.
      __ testl(CpuRegister(TMP), Immediate(1));
      __ j(kNotZero, &offset_uncompressed_label);
      __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_1, value_offset));
      __ jmp(&modify_counter);
      // Move to the start of the string: string_obj + value_offset + 2 * start_index.
      __ Bind(&offset_uncompressed_label);
      __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
      __ Bind(&modify_counter);
    } else {
      __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
    }
    // Now update ecx, the work counter: it's gonna be string.length - start_index.
    __ negq(counter);  // Needs to be 64-bit negation, as the address computation is 64-bit.
    __ leaq(counter, Address(string_length, counter, ScaleFactor::TIMES_1, 0));
  }

  if (mirror::kUseStringCompression) {
    NearLabel uncompressed_string_comparison;
    NearLabel comparison_done;
    __ testl(CpuRegister(TMP), Immediate(1));
    __ j(kNotZero, &uncompressed_string_comparison);
    // Check if RAX (search_value) is ASCII.
    __ cmpl(search_value, Immediate(127));
    __ j(kGreater, &not_found_label);
    // Comparing byte-per-byte.
    __ repne_scasb();
    __ jmp(&comparison_done);
    // Everything is set up for repne scasw:
    //   * Comparison address in RDI.
    //   * Counter in ECX.
    __ Bind(&uncompressed_string_comparison);
    __ repne_scasw();
    __ Bind(&comparison_done);
  } else {
    __ repne_scasw();
  }
  // Did we find a match?
  __ j(kNotEqual, &not_found_label);

  // Yes, we matched.  Compute the index of the result.
  __ subl(string_length, counter);
  // The scan also decremented the counter for the matching char, hence the -1.
  __ leal(out, Address(string_length, -1));

  NearLabel done;
  __ jmp(&done);

  // Failed to match; return -1.
  __ Bind(&not_found_label);
  __ movl(out, Immediate(-1));

  // And join up at the end.
  __ Bind(&done);
  if (slow_path != nullptr) {
    __ Bind(slow_path->GetExitLabel());
  }
}
1515 
// String.indexOf(int): search starts at index 0.
void IntrinsicLocationsBuilderX86_64::VisitStringIndexOf(HInvoke* invoke) {
  CreateStringIndexOfLocations(invoke, allocator_, /* start_at_zero= */ true);
}

void IntrinsicCodeGeneratorX86_64::VisitStringIndexOf(HInvoke* invoke) {
  GenerateStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero= */ true);
}

// String.indexOf(int, int): search starts at the given (clamped) index.
void IntrinsicLocationsBuilderX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
  CreateStringIndexOfLocations(invoke, allocator_, /* start_at_zero= */ false);
}

void IntrinsicCodeGeneratorX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
  GenerateStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero= */ false);
}
1531 
// This intrinsic always calls the runtime allocation entrypoint, so the four
// arguments are placed directly in the runtime calling convention registers.
// A slow path is still required for the null byte-array check in the codegen.
void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
  LocationSummary* locations = new (allocator_) LocationSummary(
      invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
  locations->SetInAt(3, Location::RegisterLocation(calling_convention.GetRegisterAt(3)));
  // The new string is returned in RAX (standard x86-64 return register).
  locations->SetOut(Location::RegisterLocation(RAX));
}
1542 
// StringFactory.newStringFromBytes: null-check the byte array (the exceptional
// case is handled on the slow path), then call the runtime allocator.
void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister byte_array = locations->InAt(0).AsRegister<CpuRegister>();
  __ testl(byte_array, byte_array);
  SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);
  __ j(kEqual, slow_path->GetEntryLabel());

  codegen_->InvokeRuntime(kQuickAllocStringFromBytes, invoke, invoke->GetDexPc());
  CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>();
  __ Bind(slow_path->GetExitLabel());
}
1557 
// Main-call-only: unlike newStringFromBytes, no slow path is needed because the
// codegen below relies on callers having already null-checked the char array.
void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
  locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
  locations->SetOut(Location::RegisterLocation(RAX));
}
1567 
// StringFactory.newStringFromChars: a straight runtime call, no checks emitted.
void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
  // No need to emit code checking whether `locations->InAt(2)` is a null
  // pointer, as callers of the native method
  //
  //   java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data)
  //
  // all include a null check on `data` before calling that method.
  codegen_->InvokeRuntime(kQuickAllocStringFromChars, invoke, invoke->GetDexPc());
  CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>();
}
1578 
// Runtime call with a slow path for the null source-string check; the single
// argument goes in the first runtime calling convention register.
void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
  LocationSummary* locations = new (allocator_) LocationSummary(
      invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
  locations->SetOut(Location::RegisterLocation(RAX));
}
1586 
// StringFactory.newStringFromString: null-check the source string (slow path
// handles the exceptional case), then call the runtime allocator.
void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister string_to_copy = locations->InAt(0).AsRegister<CpuRegister>();
  __ testl(string_to_copy, string_to_copy);
  SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);
  __ j(kEqual, slow_path->GetEntryLabel());

  codegen_->InvokeRuntime(kQuickAllocStringFromString, invoke, invoke->GetDexPc());
  CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>();
  __ Bind(slow_path->GetExitLabel());
}
1601 
void IntrinsicLocationsBuilderX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
  // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
  LocationSummary* locations =
      new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  // srcBegin may be folded into the address computation when it is a constant.
  locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetInAt(3, Location::RequiresRegister());
  locations->SetInAt(4, Location::RequiresRegister());

  // And we need some temporaries.  We will use REP MOVSW, so we need fixed registers.
  locations->AddTemp(Location::RegisterLocation(RSI));
  locations->AddTemp(Location::RegisterLocation(RDI));
  locations->AddTemp(Location::RegisterLocation(RCX));
}
1617 
1618 void IntrinsicCodeGeneratorX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
1619   X86_64Assembler* assembler = GetAssembler();
1620   LocationSummary* locations = invoke->GetLocations();
1621 
1622   size_t char_component_size = DataType::Size(DataType::Type::kUint16);
1623   // Location of data in char array buffer.
1624   const uint32_t data_offset = mirror::Array::DataOffset(char_component_size).Uint32Value();
1625   // Location of char array data in string.
1626   const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
1627 
1628   // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
1629   CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
1630   Location srcBegin = locations->InAt(1);
1631   int srcBegin_value =
1632     srcBegin.IsConstant() ? srcBegin.GetConstant()->AsIntConstant()->GetValue() : 0;
1633   CpuRegister srcEnd = locations->InAt(2).AsRegister<CpuRegister>();
1634   CpuRegister dst = locations->InAt(3).AsRegister<CpuRegister>();
1635   CpuRegister dstBegin = locations->InAt(4).AsRegister<CpuRegister>();
1636 
1637   // Check assumption that sizeof(Char) is 2 (used in scaling below).
1638   const size_t char_size = DataType::Size(DataType::Type::kUint16);
1639   DCHECK_EQ(char_size, 2u);
1640 
1641   NearLabel done;
1642   // Compute the number of chars (words) to move.
1643   __ movl(CpuRegister(RCX), srcEnd);
1644   if (srcBegin.IsConstant()) {
1645     __ subl(CpuRegister(RCX), Immediate(srcBegin_value));
1646   } else {
1647     DCHECK(srcBegin.IsRegister());
1648     __ subl(CpuRegister(RCX), srcBegin.AsRegister<CpuRegister>());
1649   }
1650   if (mirror::kUseStringCompression) {
1651     NearLabel copy_uncompressed, copy_loop;
1652     const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
1653     DCHECK_EQ(c_char_size, 1u);
1654     // Location of count in string.
1655     const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
1656 
1657     __ testl(Address(obj, count_offset), Immediate(1));
1658     static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
1659                   "Expecting 0=compressed, 1=uncompressed");
1660     __ j(kNotZero, &copy_uncompressed);
1661     // Compute the address of the source string by adding the number of chars from
1662     // the source beginning to the value offset of a string.
1663     __ leaq(CpuRegister(RSI),
1664             CodeGeneratorX86_64::ArrayAddress(obj, srcBegin, TIMES_1, value_offset));
1665     // Start the loop to copy String's value to Array of Char.
1666     __ leaq(CpuRegister(RDI), Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset));
1667 
1668     __ Bind(&copy_loop);
1669     __ jrcxz(&done);
1670     // Use TMP as temporary (convert byte from RSI to word).
1671     // TODO: Selecting RAX as the temporary and using LODSB/STOSW.
1672     __ movzxb(CpuRegister(TMP), Address(CpuRegister(RSI), 0));
1673     __ movw(Address(CpuRegister(RDI), 0), CpuRegister(TMP));
1674     __ leaq(CpuRegister(RDI), Address(CpuRegister(RDI), char_size));
1675     __ leaq(CpuRegister(RSI), Address(CpuRegister(RSI), c_char_size));
1676     // TODO: Add support for LOOP to X86_64Assembler.
1677     __ subl(CpuRegister(RCX), Immediate(1));
1678     __ jmp(&copy_loop);
1679 
1680     __ Bind(&copy_uncompressed);
1681   }
1682 
1683   __ leaq(CpuRegister(RSI),
1684           CodeGeneratorX86_64::ArrayAddress(obj, srcBegin, TIMES_2, value_offset));
1685   // Compute the address of the destination buffer.
1686   __ leaq(CpuRegister(RDI), Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset));
1687   // Do the move.
1688   __ rep_movsw();
1689 
1690   __ Bind(&done);
1691 }
1692 
1693 static void GenPeek(LocationSummary* locations, DataType::Type size, X86_64Assembler* assembler) {
1694   CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
1695   CpuRegister out = locations->Out().AsRegister<CpuRegister>();  // == address, here for clarity.
1696   // x86 allows unaligned access. We do not have to check the input or use specific instructions
1697   // to avoid a SIGBUS.
1698   switch (size) {
1699     case DataType::Type::kInt8:
1700       __ movsxb(out, Address(address, 0));
1701       break;
1702     case DataType::Type::kInt16:
1703       __ movsxw(out, Address(address, 0));
1704       break;
1705     case DataType::Type::kInt32:
1706       __ movl(out, Address(address, 0));
1707       break;
1708     case DataType::Type::kInt64:
1709       __ movq(out, Address(address, 0));
1710       break;
1711     default:
1712       LOG(FATAL) << "Type not recognized for peek: " << size;
1713       UNREACHABLE();
1714   }
1715 }
1716 
// Memory.peekByte intrinsic: read a signed byte from a raw address.
void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
  GenPeek(invoke->GetLocations(), DataType::Type::kInt8, GetAssembler());
}

// Memory.peekIntNative intrinsic: read a 32-bit value from a raw address.
void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
  GenPeek(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
}

// Memory.peekLongNative intrinsic: read a 64-bit value from a raw address.
void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
  GenPeek(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
}

// Memory.peekShortNative intrinsic: read a signed 16-bit value from a raw address.
void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
  GenPeek(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
}
1748 
// Locations for the Memory.poke* intrinsics: address in a register, value in a
// register or encodable as a 32-bit immediate.
static void CreateIntIntToVoidLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RegisterOrInt32Constant(invoke->InputAt(1)));
}
1755 
// Emits a raw-memory store of the requested width to the address in the first
// input register. The value may be either a register or a constant (emitted as
// an immediate of the appropriate width).
static void GenPoke(LocationSummary* locations, DataType::Type size, X86_64Assembler* assembler) {
  CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
  Location value = locations->InAt(1);
  // x86 allows unaligned access. We do not have to check the input or use specific instructions
  // to avoid a SIGBUS.
  switch (size) {
    case DataType::Type::kInt8:
      if (value.IsConstant()) {
        __ movb(Address(address, 0),
                Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
      } else {
        __ movb(Address(address, 0), value.AsRegister<CpuRegister>());
      }
      break;
    case DataType::Type::kInt16:
      if (value.IsConstant()) {
        __ movw(Address(address, 0),
                Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
      } else {
        __ movw(Address(address, 0), value.AsRegister<CpuRegister>());
      }
      break;
    case DataType::Type::kInt32:
      if (value.IsConstant()) {
        __ movl(Address(address, 0),
                Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
      } else {
        __ movl(Address(address, 0), value.AsRegister<CpuRegister>());
      }
      break;
    case DataType::Type::kInt64:
      if (value.IsConstant()) {
        // movq with an immediate only accepts a 32-bit operand (sign-extended);
        // the locations builder only allowed int32-encodable constants here.
        int64_t v = value.GetConstant()->AsLongConstant()->GetValue();
        DCHECK(IsInt<32>(v));
        int32_t v_32 = v;
        __ movq(Address(address, 0), Immediate(v_32));
      } else {
        __ movq(Address(address, 0), value.AsRegister<CpuRegister>());
      }
      break;
    default:
      LOG(FATAL) << "Type not recognized for poke: " << size;
      UNREACHABLE();
  }
}
1801 
// Memory.pokeByte intrinsic: write one byte to a raw address.
void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
  GenPoke(invoke->GetLocations(), DataType::Type::kInt8, GetAssembler());
}

// Memory.pokeIntNative intrinsic: write a 32-bit value to a raw address.
void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
  GenPoke(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
}

// Memory.pokeLongNative intrinsic: write a 64-bit value to a raw address.
void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
  GenPoke(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
}

// Memory.pokeShortNative intrinsic: write a 16-bit value to a raw address.
void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
  GenPoke(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
}
1833 
// Thread.currentThread(): no call needed, just a thread-local load.
void IntrinsicLocationsBuilderX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetOut(Location::RequiresRegister());
}

void IntrinsicCodeGeneratorX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
  CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>();
  // Load the managed peer object from the current Thread's GS-based TLS block.
  // 32-bit load: heap references are 32 bits. No RIP-relative addressing with GS.
  GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64PointerSize>(),
                                                    /* no_rip= */ true));
}
1845 
// Emits an Unsafe.get* load of `type` from (base + offset). Reference loads go
// through the configured read barrier (Baker or slow-path) or unpoisoning.
// `is_volatile` is deliberately unused: under the x86-64 memory model a plain
// load already provides the required ordering for a volatile read.
static void GenUnsafeGet(HInvoke* invoke,
                         DataType::Type type,
                         bool is_volatile ATTRIBUTE_UNUSED,
                         CodeGeneratorX86_64* codegen) {
  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
  LocationSummary* locations = invoke->GetLocations();
  Location base_loc = locations->InAt(1);
  CpuRegister base = base_loc.AsRegister<CpuRegister>();
  Location offset_loc = locations->InAt(2);
  CpuRegister offset = offset_loc.AsRegister<CpuRegister>();
  Location output_loc = locations->Out();
  CpuRegister output = output_loc.AsRegister<CpuRegister>();

  switch (type) {
    case DataType::Type::kInt32:
      __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
      break;

    case DataType::Type::kReference: {
      if (kEmitCompilerReadBarrier) {
        if (kUseBakerReadBarrier) {
          // Fast-path read barrier: the load and barrier are fused by the codegen.
          Address src(base, offset, ScaleFactor::TIMES_1, 0);
          codegen->GenerateReferenceLoadWithBakerReadBarrier(
              invoke, output_loc, base, src, /* needs_null_check= */ false);
        } else {
          // Plain load followed by the slow-path read barrier on the result.
          __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
          codegen->GenerateReadBarrierSlow(
              invoke, output_loc, output_loc, base_loc, 0U, offset_loc);
        }
      } else {
        __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
        __ MaybeUnpoisonHeapReference(output);
      }
      break;
    }

    case DataType::Type::kInt64:
      __ movq(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
      break;

    default:
      LOG(FATAL) << "Unsupported op size " << type;
      UNREACHABLE();
  }
}
1891 
// Locations for the Unsafe.get* intrinsics: (receiver, base object, offset) -> value.
// Object-reading variants may need a read-barrier slow path call.
static void CreateIntIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  bool can_call = kEmitCompilerReadBarrier &&
      (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
       invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke,
                                      can_call
                                          ? LocationSummary::kCallOnSlowPath
                                          : LocationSummary::kNoCall,
                                      kIntrinsified);
  if (can_call && kUseBakerReadBarrier) {
    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
  }
  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RequiresRegister());
  // With a read barrier the output may be clobbered before the inputs are dead,
  // so it must not share a register with them.
  locations->SetOut(Location::RequiresRegister(),
                    (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap));
}
1911 
// Unsafe.get{Int,Long,Object}[Volatile] intrinsics. All share the same location
// setup; the codegen dispatches to GenUnsafeGet with the right type. Volatile
// and non-volatile variants emit identical code (see GenUnsafeGet).
void IntrinsicLocationsBuilderX86_64::VisitUnsafeGet(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(allocator_, invoke);
}


void IntrinsicCodeGeneratorX86_64::VisitUnsafeGet(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile= */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile= */ true, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile= */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile= */ true, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile= */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile= */ true, codegen_);
}
1950 
1951 
// Locations for the Unsafe.put* intrinsics: (receiver, base object, offset, value).
// Reference stores additionally need two temps for the GC card mark (and
// possibly for poisoning the stored reference).
static void CreateIntIntIntIntToVoidPlusTempsLocations(ArenaAllocator* allocator,
                                                       DataType::Type type,
                                                       HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetInAt(3, Location::RequiresRegister());
  if (type == DataType::Type::kReference) {
    // Need temp registers for card-marking.
    locations->AddTemp(Location::RequiresRegister());  // Possibly used for reference poisoning too.
    locations->AddTemp(Location::RequiresRegister());
  }
}
1967 
// Unsafe.put{Int,Object,Long}[Ordered|Volatile] location builders. All variants
// of a type share the same setup; ordering only affects the emitted code.
void IntrinsicLocationsBuilderX86_64::VisitUnsafePut(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt32, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt32, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt32, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObject(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kReference, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kReference, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kReference, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLong(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt64, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt64, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
  CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt64, invoke);
}
1995 
// We don't care for ordered: it requires an AnyStore barrier, which is already given by the x86
// memory model.
// Emits an Unsafe.put* store of `type` to (base + offset). Volatile stores get
// a trailing memory fence; reference stores are poisoned if configured and
// always followed by a GC card mark.
static void GenUnsafePut(LocationSummary* locations, DataType::Type type, bool is_volatile,
                         CodeGeneratorX86_64* codegen) {
  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
  CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
  CpuRegister value = locations->InAt(3).AsRegister<CpuRegister>();

  if (type == DataType::Type::kInt64) {
    __ movq(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
  } else if (kPoisonHeapReferences && type == DataType::Type::kReference) {
    // Poison a copy in the temp so the caller's value register is preserved.
    CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
    __ movl(temp, value);
    __ PoisonHeapReference(temp);
    __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), temp);
  } else {
    __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
  }

  if (is_volatile) {
    codegen->MemoryFence();
  }

  if (type == DataType::Type::kReference) {
    bool value_can_be_null = true;  // TODO: Worth finding out this information?
    codegen->MarkGCCard(locations->GetTemp(0).AsRegister<CpuRegister>(),
                        locations->GetTemp(1).AsRegister<CpuRegister>(),
                        base,
                        value,
                        value_can_be_null);
  }
}
2029 
// Unsafe.put* code generators. Note that the Ordered variants pass
// is_volatile=false: x86-64 stores already have the required store ordering
// (see the comment above GenUnsafePut), so only Volatile needs the fence.
void IntrinsicCodeGeneratorX86_64::VisitUnsafePut(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile= */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile= */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile= */ true, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObject(HInvoke* invoke) {
  GenUnsafePut(
      invoke->GetLocations(), DataType::Type::kReference, /* is_volatile= */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
  GenUnsafePut(
      invoke->GetLocations(), DataType::Type::kReference, /* is_volatile= */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
  GenUnsafePut(
      invoke->GetLocations(), DataType::Type::kReference, /* is_volatile= */ true, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLong(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile= */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile= */ false, codegen_);
}
void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
  GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile= */ true, codegen_);
}
2060 
// Locations for the Unsafe CAS intrinsics:
// (receiver, base object, offset, expected, new value) -> success flag.
static void CreateIntIntIntIntIntToInt(ArenaAllocator* allocator,
                                       DataType::Type type,
                                       HInvoke* invoke) {
  bool can_call = kEmitCompilerReadBarrier &&
      kUseBakerReadBarrier &&
      (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject);
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke,
                                      can_call
                                          ? LocationSummary::kCallOnSlowPath
                                          : LocationSummary::kNoCall,
                                      kIntrinsified);
  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RequiresRegister());
  // expected value must be in EAX/RAX.
  locations->SetInAt(3, Location::RegisterLocation(RAX));
  locations->SetInAt(4, Location::RequiresRegister());

  locations->SetOut(Location::RequiresRegister());
  if (type == DataType::Type::kReference) {
    // Need temporary registers for card-marking, and possibly for
    // (Baker) read barrier.
    locations->AddTemp(Location::RequiresRegister());  // Possibly used for reference poisoning too.
    locations->AddTemp(Location::RequiresRegister());
  }
}
2088 
// Locations for the Unsafe CAS intrinsics; all three share the same helper.
void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
  CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kInt32, invoke);
}

void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
  CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kInt64, invoke);
}

void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
  // The only read barrier implementation supporting the
  // UnsafeCASObject intrinsic is the Baker-style read barriers.
  // Creating no locations here means the invoke is not intrinsified.
  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
    return;
  }

  CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kReference, invoke);
}
2106 
// Emits an atomic compare-and-swap via LOCK CMPXCHG and converts the
// resulting ZF into a 0/1 boolean in the output register. `expected` is
// pinned in RAX by the locations builder (a CMPXCHG requirement). The
// reference case additionally performs card marking, an optional Baker read
// barrier on the field, and heap-reference poisoning/unpoisoning.
static void GenCAS(DataType::Type type, HInvoke* invoke, CodeGeneratorX86_64* codegen) {
  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
  CpuRegister expected = locations->InAt(3).AsRegister<CpuRegister>();
  // Ensure `expected` is in RAX (required by the CMPXCHG instruction).
  DCHECK_EQ(expected.AsRegister(), RAX);
  CpuRegister value = locations->InAt(4).AsRegister<CpuRegister>();
  Location out_loc = locations->Out();
  CpuRegister out = out_loc.AsRegister<CpuRegister>();

  if (type == DataType::Type::kReference) {
    // The only read barrier implementation supporting the
    // UnsafeCASObject intrinsic is the Baker-style read barriers.
    DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);

    CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
    CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();

    // Mark card for object assuming new value is stored.
    bool value_can_be_null = true;  // TODO: Worth finding out this information?
    codegen->MarkGCCard(temp1, temp2, base, value, value_can_be_null);

    // The address of the field within the holding object.
    Address field_addr(base, offset, ScaleFactor::TIMES_1, 0);

    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
      // Need to make sure the reference stored in the field is a to-space
      // one before attempting the CAS or the CAS could fail incorrectly.
      codegen->GenerateReferenceLoadWithBakerReadBarrier(
          invoke,
          out_loc,  // Unused, used only as a "temporary" within the read barrier.
          base,
          field_addr,
          /* needs_null_check= */ false,
          /* always_update_field= */ true,
          &temp1,
          &temp2);
    }

    bool base_equals_value = (base.AsRegister() == value.AsRegister());
    Register value_reg = value.AsRegister();
    if (kPoisonHeapReferences) {
      if (base_equals_value) {
        // If `base` and `value` are the same register location, move
        // `value_reg` to a temporary register.  This way, poisoning
        // `value_reg` won't invalidate `base`.
        value_reg = temp1.AsRegister();
        __ movl(CpuRegister(value_reg), base);
      }

      // Check that the register allocator did not assign the location
      // of `expected` (RAX) to `value` nor to `base`, so that heap
      // poisoning (when enabled) works as intended below.
      // - If `value` were equal to `expected`, both references would
      //   be poisoned twice, meaning they would not be poisoned at
      //   all, as heap poisoning uses address negation.
      // - If `base` were equal to `expected`, poisoning `expected`
      //   would invalidate `base`.
      DCHECK_NE(value_reg, expected.AsRegister());
      DCHECK_NE(base.AsRegister(), expected.AsRegister());

      __ PoisonHeapReference(expected);
      __ PoisonHeapReference(CpuRegister(value_reg));
    }

    // 32-bit compare-and-exchange (heap references are 32 bits here; see
    // the heap-reference size static_assert in VisitIntegerValueOf).
    __ LockCmpxchgl(field_addr, CpuRegister(value_reg));

    // LOCK CMPXCHG has full barrier semantics, and we don't need
    // scheduling barriers at this time.

    // Convert ZF into the Boolean result.
    __ setcc(kZero, out);
    __ movzxb(out, out);

    // If heap poisoning is enabled, we need to unpoison the values
    // that were poisoned earlier.
    if (kPoisonHeapReferences) {
      if (base_equals_value) {
        // `value_reg` has been moved to a temporary register, no need
        // to unpoison it.
      } else {
        // Ensure `value` is different from `out`, so that unpoisoning
        // the former does not invalidate the latter.
        DCHECK_NE(value_reg, out.AsRegister());
        __ UnpoisonHeapReference(CpuRegister(value_reg));
      }
      // Ensure `expected` is different from `out`, so that unpoisoning
      // the former does not invalidate the latter.
      DCHECK_NE(expected.AsRegister(), out.AsRegister());
      __ UnpoisonHeapReference(expected);
    }
  } else {
    if (type == DataType::Type::kInt32) {
      __ LockCmpxchgl(Address(base, offset, TIMES_1, 0), value);
    } else if (type == DataType::Type::kInt64) {
      __ LockCmpxchgq(Address(base, offset, TIMES_1, 0), value);
    } else {
      LOG(FATAL) << "Unexpected CAS type " << type;
    }

    // LOCK CMPXCHG has full barrier semantics, and we don't need
    // scheduling barriers at this time.

    // Convert ZF into the Boolean result.
    __ setcc(kZero, out);
    __ movzxb(out, out);
  }
}
2218 
// Code generation for the Unsafe CAS intrinsics; all three forward to GenCAS
// with the operand type.
void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
  GenCAS(DataType::Type::kInt32, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
  GenCAS(DataType::Type::kInt64, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
  // The only read barrier implementation supporting the
  // UnsafeCASObject intrinsic is the Baker-style read barriers.
  // (Other configurations were filtered out in the locations builder.)
  DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);

  GenCAS(DataType::Type::kReference, invoke, codegen_);
}
2234 
2235 void IntrinsicLocationsBuilderX86_64::VisitIntegerReverse(HInvoke* invoke) {
2236   LocationSummary* locations =
2237       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2238   locations->SetInAt(0, Location::RequiresRegister());
2239   locations->SetOut(Location::SameAsFirstInput());
2240   locations->AddTemp(Location::RequiresRegister());
2241 }
2242 
2243 static void SwapBits(CpuRegister reg, CpuRegister temp, int32_t shift, int32_t mask,
2244                      X86_64Assembler* assembler) {
2245   Immediate imm_shift(shift);
2246   Immediate imm_mask(mask);
2247   __ movl(temp, reg);
2248   __ shrl(reg, imm_shift);
2249   __ andl(temp, imm_mask);
2250   __ andl(reg, imm_mask);
2251   __ shll(temp, imm_shift);
2252   __ orl(reg, temp);
2253 }
2254 
// Integer.reverse code generation. The result is produced in place: `reg` is
// both input and output (SameAsFirstInput in the locations).
void IntrinsicCodeGeneratorX86_64::VisitIntegerReverse(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();

  /*
   * Use one bswap instruction to reverse byte order first and then use 3 rounds of
   * swapping bits to reverse bits in a number x. Using bswap to save instructions
   * compared to generic luni implementation which has 5 rounds of swapping bits.
   * x = bswap x
   * x = (x & 0x55555555) << 1 | (x >> 1) & 0x55555555;
   * x = (x & 0x33333333) << 2 | (x >> 2) & 0x33333333;
   * x = (x & 0x0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F;
   */
  __ bswapl(reg);
  SwapBits(reg, temp, 1, 0x55555555, assembler);
  SwapBits(reg, temp, 2, 0x33333333, assembler);
  SwapBits(reg, temp, 4, 0x0f0f0f0f, assembler);
}
2276 
2277 void IntrinsicLocationsBuilderX86_64::VisitLongReverse(HInvoke* invoke) {
2278   LocationSummary* locations =
2279       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2280   locations->SetInAt(0, Location::RequiresRegister());
2281   locations->SetOut(Location::SameAsFirstInput());
2282   locations->AddTemp(Location::RequiresRegister());
2283   locations->AddTemp(Location::RequiresRegister());
2284 }
2285 
2286 static void SwapBits64(CpuRegister reg, CpuRegister temp, CpuRegister temp_mask,
2287                        int32_t shift, int64_t mask, X86_64Assembler* assembler) {
2288   Immediate imm_shift(shift);
2289   __ movq(temp_mask, Immediate(mask));
2290   __ movq(temp, reg);
2291   __ shrq(reg, imm_shift);
2292   __ andq(temp, temp_mask);
2293   __ andq(reg, temp_mask);
2294   __ shlq(temp, imm_shift);
2295   __ orq(reg, temp);
2296 }
2297 
// Long.reverse code generation; in-place, mirrors the 32-bit version but
// needs a second temporary to hold each 64-bit mask.
void IntrinsicCodeGeneratorX86_64::VisitLongReverse(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
  CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();

  /*
   * Use one bswap instruction to reverse byte order first and then use 3 rounds of
   * swapping bits to reverse bits in a long number x. Using bswap to save instructions
   * compared to generic luni implementation which has 5 rounds of swapping bits.
   * x = bswap x
   * x = (x & 0x5555555555555555) << 1 | (x >> 1) & 0x5555555555555555;
   * x = (x & 0x3333333333333333) << 2 | (x >> 2) & 0x3333333333333333;
   * x = (x & 0x0F0F0F0F0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F0F0F0F0F;
   */
  __ bswapq(reg);
  SwapBits64(reg, temp1, temp2, 1, INT64_C(0x5555555555555555), assembler);
  SwapBits64(reg, temp1, temp2, 2, INT64_C(0x3333333333333333), assembler);
  SwapBits64(reg, temp1, temp2, 4, INT64_C(0x0f0f0f0f0f0f0f0f), assembler);
}
2320 
2321 static void CreateBitCountLocations(
2322     ArenaAllocator* allocator, CodeGeneratorX86_64* codegen, HInvoke* invoke) {
2323   if (!codegen->GetInstructionSetFeatures().HasPopCnt()) {
2324     // Do nothing if there is no popcnt support. This results in generating
2325     // a call for the intrinsic rather than direct code.
2326     return;
2327   }
2328   LocationSummary* locations =
2329       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2330   locations->SetInAt(0, Location::Any());
2331   locations->SetOut(Location::RequiresRegister());
2332 }
2333 
2334 static void GenBitCount(X86_64Assembler* assembler,
2335                         CodeGeneratorX86_64* codegen,
2336                         HInvoke* invoke,
2337                         bool is_long) {
2338   LocationSummary* locations = invoke->GetLocations();
2339   Location src = locations->InAt(0);
2340   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
2341 
2342   if (invoke->InputAt(0)->IsConstant()) {
2343     // Evaluate this at compile time.
2344     int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
2345     int32_t result = is_long
2346         ? POPCOUNT(static_cast<uint64_t>(value))
2347         : POPCOUNT(static_cast<uint32_t>(value));
2348     codegen->Load32BitValue(out, result);
2349     return;
2350   }
2351 
2352   if (src.IsRegister()) {
2353     if (is_long) {
2354       __ popcntq(out, src.AsRegister<CpuRegister>());
2355     } else {
2356       __ popcntl(out, src.AsRegister<CpuRegister>());
2357     }
2358   } else if (is_long) {
2359     DCHECK(src.IsDoubleStackSlot());
2360     __ popcntq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2361   } else {
2362     DCHECK(src.IsStackSlot());
2363     __ popcntl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2364   }
2365 }
2366 
// Integer/Long.bitCount visitors: shared helpers, with is_long selecting the
// 64-bit POPCNT path.
void IntrinsicLocationsBuilderX86_64::VisitIntegerBitCount(HInvoke* invoke) {
  CreateBitCountLocations(allocator_, codegen_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerBitCount(HInvoke* invoke) {
  GenBitCount(GetAssembler(), codegen_, invoke, /* is_long= */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongBitCount(HInvoke* invoke) {
  CreateBitCountLocations(allocator_, codegen_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongBitCount(HInvoke* invoke) {
  GenBitCount(GetAssembler(), codegen_, invoke, /* is_long= */ true);
}
2382 
2383 static void CreateOneBitLocations(ArenaAllocator* allocator, HInvoke* invoke, bool is_high) {
2384   LocationSummary* locations =
2385       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2386   locations->SetInAt(0, Location::Any());
2387   locations->SetOut(Location::RequiresRegister());
2388   locations->AddTemp(is_high ? Location::RegisterLocation(RCX)  // needs CL
2389                              : Location::RequiresRegister());  // any will do
2390 }
2391 
// Generates Integer/Long.highestOneBit (is_high) and lowestOneBit.
// Constants are folded at compile time. Otherwise:
//  - highestOneBit: 1 << BSR(src), with an explicit fixup producing 0 for a
//    zero input (BSR sets ZF and leaves its output undefined on zero);
//  - lowestOneBit: BLSI when the feature check passes, else src & -src.
// NOTE(review): BLSI is a BMI1 instruction; HasAVX2() appears to be used as
// a proxy for BMI1 availability here — confirm against the feature flags.
static void GenOneBit(X86_64Assembler* assembler,
                      CodeGeneratorX86_64* codegen,
                      HInvoke* invoke,
                      bool is_high, bool is_long) {
  LocationSummary* locations = invoke->GetLocations();
  Location src = locations->InAt(0);
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  if (invoke->InputAt(0)->IsConstant()) {
    // Evaluate this at compile time.
    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
    if (value == 0) {
      __ xorl(out, out);  // Clears upper bits too.
      return;
    }
    // Nonzero value: compute the bit index of the highest/lowest set bit.
    if (is_high) {
      value = is_long ? 63 - CLZ(static_cast<uint64_t>(value))
                      : 31 - CLZ(static_cast<uint32_t>(value));
    } else {
      value = is_long ? CTZ(static_cast<uint64_t>(value))
                      : CTZ(static_cast<uint32_t>(value));
    }
    if (is_long) {
      codegen->Load64BitValue(out, 1ULL << value);
    } else {
      codegen->Load32BitValue(out, 1 << value);
    }
    return;
  }

  // Handle the non-constant cases.
  if (!is_high && codegen->GetInstructionSetFeatures().HasAVX2() &&
      src.IsRegister()) {
      // BLSI extracts the lowest set bit in one instruction.
      __ blsi(out, src.AsRegister<CpuRegister>());
  } else {
    CpuRegister tmp = locations->GetTemp(0).AsRegister<CpuRegister>();
    if (is_high) {
      // Use architectural support: basically 1 << bsr.
      if (src.IsRegister()) {
        if (is_long) {
          __ bsrq(tmp, src.AsRegister<CpuRegister>());
        } else {
          __ bsrl(tmp, src.AsRegister<CpuRegister>());
        }
      } else if (is_long) {
        DCHECK(src.IsDoubleStackSlot());
        __ bsrq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
      } else {
        DCHECK(src.IsStackSlot());
        __ bsrl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
      }
      // BSR sets ZF if the input was zero.
      NearLabel is_zero, done;
      __ j(kEqual, &is_zero);
      __ movl(out, Immediate(1));  // Clears upper bits too.
      // Shift the 1 up to the highest set bit's position (tmp is in RCX,
      // per CreateOneBitLocations).
      if (is_long) {
        __ shlq(out, tmp);
      } else {
        __ shll(out, tmp);
      }
      __ jmp(&done);
      __ Bind(&is_zero);
      __ xorl(out, out);  // Clears upper bits too.
      __ Bind(&done);
    } else  {
      // Copy input into temporary.
      if (src.IsRegister()) {
        if (is_long) {
          __ movq(tmp, src.AsRegister<CpuRegister>());
        } else {
          __ movl(tmp, src.AsRegister<CpuRegister>());
        }
      } else if (is_long) {
        DCHECK(src.IsDoubleStackSlot());
        __ movq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
      } else {
        DCHECK(src.IsStackSlot());
        __ movl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
      }
      // Do the bit twiddling: basically tmp & -tmp;
      if (is_long) {
        __ movq(out, tmp);
        __ negq(tmp);
        __ andq(out, tmp);
      } else {
        __ movl(out, tmp);
        __ negl(tmp);
        __ andl(out, tmp);
      }
    }
  }
}
2485 
// Integer/Long.{highest,lowest}OneBit visitors; is_high/is_long select the
// variant inside the shared helpers.
void IntrinsicLocationsBuilderX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
  CreateOneBitLocations(allocator_, invoke, /* is_high= */ true);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high= */ true, /* is_long= */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
  CreateOneBitLocations(allocator_, invoke, /* is_high= */ true);
}

void IntrinsicCodeGeneratorX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high= */ true, /* is_long= */ true);
}

void IntrinsicLocationsBuilderX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
  CreateOneBitLocations(allocator_, invoke, /* is_high= */ false);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high= */ false, /* is_long= */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
  CreateOneBitLocations(allocator_, invoke, /* is_high= */ false);
}

void IntrinsicCodeGeneratorX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high= */ false, /* is_long= */ true);
}
2517 
2518 static void CreateLeadingZeroLocations(ArenaAllocator* allocator, HInvoke* invoke) {
2519   LocationSummary* locations =
2520       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2521   locations->SetInAt(0, Location::Any());
2522   locations->SetOut(Location::RequiresRegister());
2523 }
2524 
// Generates Integer/Long.numberOfLeadingZeros via BSR. For a nonzero input,
// CLZ == (width - 1) - BSR(x) == BSR(x) XOR (width - 1); a zero input (ZF
// set by BSR, output undefined) is fixed up to return the full bit width.
// Constants are folded at compile time.
static void GenLeadingZeros(X86_64Assembler* assembler,
                            CodeGeneratorX86_64* codegen,
                            HInvoke* invoke, bool is_long) {
  LocationSummary* locations = invoke->GetLocations();
  Location src = locations->InAt(0);
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  // Result for a zero input: the full bit width.
  int zero_value_result = is_long ? 64 : 32;
  if (invoke->InputAt(0)->IsConstant()) {
    // Evaluate this at compile time.
    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
    if (value == 0) {
      value = zero_value_result;
    } else {
      value = is_long ? CLZ(static_cast<uint64_t>(value)) : CLZ(static_cast<uint32_t>(value));
    }
    codegen->Load32BitValue(out, value);
    return;
  }

  // Handle the non-constant cases.
  if (src.IsRegister()) {
    if (is_long) {
      __ bsrq(out, src.AsRegister<CpuRegister>());
    } else {
      __ bsrl(out, src.AsRegister<CpuRegister>());
    }
  } else if (is_long) {
    DCHECK(src.IsDoubleStackSlot());
    __ bsrq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  } else {
    DCHECK(src.IsStackSlot());
    __ bsrl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  }

  // BSR sets ZF if the input was zero, and the output is undefined.
  NearLabel is_zero, done;
  __ j(kEqual, &is_zero);

  // Correct the result from BSR to get the CLZ result.
  // XOR with (width - 1) is equivalent to (width - 1) - BSR here.
  __ xorl(out, Immediate(zero_value_result - 1));
  __ jmp(&done);

  // Fix the zero case with the expected result.
  __ Bind(&is_zero);
  __ movl(out, Immediate(zero_value_result));

  __ Bind(&done);
}
2574 
// Integer/Long.numberOfLeadingZeros visitors; shared helpers, with is_long
// selecting the 64-bit BSR path.
void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
  CreateLeadingZeroLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
  GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long= */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
  CreateLeadingZeroLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
  GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long= */ true);
}
2590 
2591 static void CreateTrailingZeroLocations(ArenaAllocator* allocator, HInvoke* invoke) {
2592   LocationSummary* locations =
2593       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2594   locations->SetInAt(0, Location::Any());
2595   locations->SetOut(Location::RequiresRegister());
2596 }
2597 
// Generates Integer/Long.numberOfTrailingZeros via BSF, which directly
// yields CTZ for a nonzero input; a zero input (ZF set by BSF, output
// undefined) is fixed up to return the full bit width. Constants are
// folded at compile time.
static void GenTrailingZeros(X86_64Assembler* assembler,
                             CodeGeneratorX86_64* codegen,
                             HInvoke* invoke, bool is_long) {
  LocationSummary* locations = invoke->GetLocations();
  Location src = locations->InAt(0);
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  // Result for a zero input: the full bit width.
  int zero_value_result = is_long ? 64 : 32;
  if (invoke->InputAt(0)->IsConstant()) {
    // Evaluate this at compile time.
    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
    if (value == 0) {
      value = zero_value_result;
    } else {
      value = is_long ? CTZ(static_cast<uint64_t>(value)) : CTZ(static_cast<uint32_t>(value));
    }
    codegen->Load32BitValue(out, value);
    return;
  }

  // Handle the non-constant cases.
  if (src.IsRegister()) {
    if (is_long) {
      __ bsfq(out, src.AsRegister<CpuRegister>());
    } else {
      __ bsfl(out, src.AsRegister<CpuRegister>());
    }
  } else if (is_long) {
    DCHECK(src.IsDoubleStackSlot());
    __ bsfq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  } else {
    DCHECK(src.IsStackSlot());
    __ bsfl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  }

  // BSF sets ZF if the input was zero, and the output is undefined.
  NearLabel done;
  __ j(kNotEqual, &done);

  // Fix the zero case with the expected result.
  __ movl(out, Immediate(zero_value_result));

  __ Bind(&done);
}
2642 
// Integer/Long.numberOfTrailingZeros visitors; shared helpers, with is_long
// selecting the 64-bit BSF path.
void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
  CreateTrailingZeroLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
  GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long= */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
  CreateTrailingZeroLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
  GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long= */ true);
}
2658 
// Integer.valueOf locations: a shared helper decides whether to intrinsify.
// The output is fixed to RAX and the (potential) allocation argument to the
// first runtime-call argument register.
void IntrinsicLocationsBuilderX86_64::VisitIntegerValueOf(HInvoke* invoke) {
  InvokeRuntimeCallingConvention calling_convention;
  IntrinsicVisitor::ComputeIntegerValueOfLocations(
      invoke,
      codegen_,
      Location::RegisterLocation(RAX),
      Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
}
2667 
// Integer.valueOf code generation. Values inside the boot-image Integer
// cache [info.low, info.low + info.length) are loaded directly from the
// cache array; anything else allocates a fresh j.l.Integer and stores the
// primitive value into its value field.
void IntrinsicCodeGeneratorX86_64::VisitIntegerValueOf(HInvoke* invoke) {
  IntrinsicVisitor::IntegerValueOfInfo info =
      IntrinsicVisitor::ComputeIntegerValueOfInfo(invoke, codegen_->GetCompilerOptions());
  LocationSummary* locations = invoke->GetLocations();
  X86_64Assembler* assembler = GetAssembler();

  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
  InvokeRuntimeCallingConvention calling_convention;
  CpuRegister argument = CpuRegister(calling_convention.GetRegisterAt(0));
  if (invoke->InputAt(0)->IsIntConstant()) {
    int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue();
    // Unsigned compare: also rejects value < info.low.
    if (static_cast<uint32_t>(value - info.low) < info.length) {
      // Just embed the j.l.Integer in the code.
      DCHECK_NE(info.value_boot_image_reference, IntegerValueOfInfo::kInvalidReference);
      codegen_->LoadBootImageAddress(out, info.value_boot_image_reference);
    } else {
      DCHECK(locations->CanCall());
      // Allocate and initialize a new j.l.Integer.
      // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the
      // JIT object table.
      codegen_->AllocateInstanceForIntrinsic(invoke->AsInvokeStaticOrDirect(),
                                             info.integer_boot_image_offset);
      __ movl(Address(out, info.value_offset), Immediate(value));
    }
  } else {
    DCHECK(locations->CanCall());
    CpuRegister in = locations->InAt(0).AsRegister<CpuRegister>();
    // Check bounds of our cache: out = in - info.low, then one unsigned
    // compare against the cache length.
    __ leal(out, Address(in, -info.low));
    __ cmpl(out, Immediate(info.length));
    NearLabel allocate, done;
    __ j(kAboveEqual, &allocate);
    // If the value is within the bounds, load the j.l.Integer directly from the array.
    DCHECK_NE(out.AsRegister(), argument.AsRegister());
    codegen_->LoadBootImageAddress(argument, info.array_data_boot_image_reference);
    static_assert((1u << TIMES_4) == sizeof(mirror::HeapReference<mirror::Object>),
                  "Check heap reference size.");
    __ movl(out, Address(argument, out, TIMES_4, 0));
    __ MaybeUnpoisonHeapReference(out);
    __ jmp(&done);
    __ Bind(&allocate);
    // Otherwise allocate and initialize a new j.l.Integer.
    codegen_->AllocateInstanceForIntrinsic(invoke->AsInvokeStaticOrDirect(),
                                           info.integer_boot_image_offset);
    __ movl(Address(out, info.value_offset), in);
    __ Bind(&done);
  }
}
2716 
2717 void IntrinsicLocationsBuilderX86_64::VisitThreadInterrupted(HInvoke* invoke) {
2718   LocationSummary* locations =
2719       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2720   locations->SetOut(Location::RequiresRegister());
2721 }
2722 
// Thread.interrupted() code generation: loads the current thread's
// interrupted flag (a fixed offset from the thread-local base, addressed
// via the GS segment). If the flag was set, it is cleared and a memory
// fence is emitted; the loaded value is the result.
void IntrinsicCodeGeneratorX86_64::VisitThreadInterrupted(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>();
  Address address = Address::Absolute
      (Thread::InterruptedOffset<kX86_64PointerSize>().Int32Value(), /* no_rip= */ true);
  NearLabel done;
  __ gs()->movl(out, address);
  __ testl(out, out);
  __ j(kEqual, &done);  // Flag clear: nothing to reset.
  __ gs()->movl(address, Immediate(0));  // Clear the flag (interrupted() consumes it).
  codegen_->MemoryFence();
  __ Bind(&done);
}
2736 
// Reference.reachabilityFence(): the argument only needs to be kept live
// (any location); no code is generated for the invoke itself.
void IntrinsicLocationsBuilderX86_64::VisitReachabilityFence(HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::Any());
}

// Intentionally empty: keeping the input live via the locations above is
// the entire effect of the fence.
void IntrinsicCodeGeneratorX86_64::VisitReachabilityFence(HInvoke* invoke ATTRIBUTE_UNUSED) { }
2744 
2745 UNIMPLEMENTED_INTRINSIC(X86_64, ReferenceGetReferent)
2746 UNIMPLEMENTED_INTRINSIC(X86_64, FloatIsInfinite)
2747 UNIMPLEMENTED_INTRINSIC(X86_64, DoubleIsInfinite)
2748 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32Update)
2749 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateBytes)
2750 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateByteBuffer)
2751 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToFloat)
2752 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToHalf)
2753 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Floor)
2754 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Ceil)
2755 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Rint)
2756 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Greater)
2757 UNIMPLEMENTED_INTRINSIC(X86_64, FP16GreaterEquals)
2758 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Less)
2759 UNIMPLEMENTED_INTRINSIC(X86_64, FP16LessEquals)
2760 
2761 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
2762 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);
2763 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferAppend);
2764 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferLength);
2765 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferToString);
2766 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendObject);
2767 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendString);
2768 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendCharSequence);
2769 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendCharArray);
2770 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendBoolean);
2771 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendChar);
2772 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendInt);
2773 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendLong);
2774 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendFloat);
2775 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendDouble);
2776 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderLength);
2777 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderToString);
2778 
2779 // 1.8.
2780 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddInt)
2781 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddLong)
2782 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetInt)
2783 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetLong)
2784 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetObject)
2785 
2786 UNREACHABLE_INTRINSICS(X86_64)
2787 
2788 #undef __
2789 
2790 }  // namespace x86_64
2791 }  // namespace art
2792