/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "intrinsics_x86_64.h"

#include <limits>

#include "arch/x86_64/instruction_set_features_x86_64.h"
#include "art_method.h"
#include "base/bit_utils.h"
#include "code_generator_x86_64.h"
#include "entrypoints/quick/quick_entrypoints.h"
#include "heap_poisoning.h"
#include "intrinsics.h"
#include "intrinsics_utils.h"
#include "lock_word.h"
#include "mirror/array-inl.h"
#include "mirror/object_array-inl.h"
#include "mirror/reference.h"
#include "mirror/string.h"
#include "scoped_thread_state_change-inl.h"
#include "thread-current-inl.h"
#include "utils/x86_64/assembler_x86_64.h"
#include "utils/x86_64/constants_x86_64.h"

namespace art {

namespace x86_64 {

IntrinsicLocationsBuilderX86_64::IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen)
    : allocator_(codegen->GetGraph()->GetAllocator()), codegen_(codegen) {
}

X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() {
  return down_cast<X86_64Assembler*>(codegen_->GetAssembler());
}

ArenaAllocator* IntrinsicCodeGeneratorX86_64::GetAllocator() {
  return codegen_->GetGraph()->GetAllocator();
}

bool IntrinsicLocationsBuilderX86_64::TryDispatch(HInvoke* invoke) {
  Dispatch(invoke);
  LocationSummary* res = invoke->GetLocations();
  if (res == nullptr) {
    return false;
  }
  return res->Intrinsified();
}

static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
  InvokeDexCallingConventionVisitorX86_64 calling_convention_visitor;
  IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
}

using IntrinsicSlowPathX86_64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86_64>;

// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
#define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())->  // NOLINT

// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
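// The slow path below re-marks each reference as it is copied: every element is
// loaded from `src_curr_addr`, passed through the ReadBarrierMark entry point, and
// stored to `dst_curr_addr`, until `src_curr_addr` reaches `src_stop_addr`.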
class ReadBarrierSystemArrayCopySlowPathX86_64 : public SlowPathCode {
 public:
  explicit ReadBarrierSystemArrayCopySlowPathX86_64(HInstruction* instruction)
      : SlowPathCode(instruction) {
    DCHECK(kEmitCompilerReadBarrier);
    DCHECK(kUseBakerReadBarrier);
  }

  void EmitNativeCode(CodeGenerator* codegen) override {
    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
    LocationSummary* locations = instruction_->GetLocations();
    DCHECK(locations->CanCall());
    DCHECK(instruction_->IsInvokeStaticOrDirect())
        << "Unexpected instruction in read barrier arraycopy slow path: "
        << instruction_->DebugName();
    DCHECK(instruction_->GetLocations()->Intrinsified());
    DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);

    int32_t element_size = DataType::Size(DataType::Type::kReference);

    CpuRegister src_curr_addr = locations->GetTemp(0).AsRegister<CpuRegister>();
    CpuRegister dst_curr_addr = locations->GetTemp(1).AsRegister<CpuRegister>();
    CpuRegister src_stop_addr = locations->GetTemp(2).AsRegister<CpuRegister>();

    __ Bind(GetEntryLabel());
    NearLabel loop;
    __ Bind(&loop);
    __ movl(CpuRegister(TMP), Address(src_curr_addr, 0));
    __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
    // TODO: Inline the mark bit check before calling the runtime?
    // TMP = ReadBarrier::Mark(TMP);
    // No need to save live registers; it's taken care of by the
    // entrypoint. Also, there is no need to update the stack mask,
    // as this runtime call will not trigger a garbage collection.
    int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP);
    // This runtime call does not require a stack map.
    x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
    __ MaybePoisonHeapReference(CpuRegister(TMP));
    __ movl(Address(dst_curr_addr, 0), CpuRegister(TMP));
    __ addl(src_curr_addr, Immediate(element_size));
    __ addl(dst_curr_addr, Immediate(element_size));
    __ cmpl(src_curr_addr, src_stop_addr);
    __ j(kNotEqual, &loop);
    __ jmp(GetExitLabel());
  }

  const char* GetDescription() const override { return "ReadBarrierSystemArrayCopySlowPathX86_64"; }

 private:
  DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86_64);
};

#undef __

#define __ assembler->

static void CreateFPToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresRegister());
}

static void CreateIntToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::RequiresFpuRegister());
}

static void MoveFPToInt(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ movd(output.AsRegister<CpuRegister>(), input.AsFpuRegister<XmmRegister>(), is64bit);
}

static void MoveIntToFP(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
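  // As in MoveFPToInt above, this is a raw bit transfer between the general-purpose
  // and XMM register files; `is64bit` selects the 64-bit (long/double) variant.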
  __ movd(output.AsFpuRegister<XmmRegister>(), input.AsRegister<CpuRegister>(), is64bit);
}

void IntrinsicLocationsBuilderX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  CreateFPToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  CreateIntToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit= */ true, GetAssembler());
}
void IntrinsicCodeGeneratorX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit= */ true, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  CreateFPToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  CreateIntToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit= */ false, GetAssembler());
}
void IntrinsicCodeGeneratorX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit= */ false, GetAssembler());
}

static void CreateIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
}

static void GenReverseBytes(LocationSummary* locations,
                            DataType::Type size,
                            X86_64Assembler* assembler) {
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  switch (size) {
    case DataType::Type::kInt16:
      // TODO: Can be done with an xchg of 8b registers. This is straight from Quick.
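      // For example, the sign-extended short 0xFFFFABCD becomes 0xCDABFFFF after
      // bswapl, and the arithmetic shift then yields the sign-extended 0xFFFFCDAB.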
      __ bswapl(out);
      __ sarl(out, Immediate(16));
      break;
    case DataType::Type::kInt32:
      __ bswapl(out);
      break;
    case DataType::Type::kInt64:
      __ bswapq(out);
      break;
    default:
      LOG(FATAL) << "Unexpected size for reverse-bytes: " << size;
      UNREACHABLE();
  }
}

void IntrinsicLocationsBuilderX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitLongReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitShortReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitShortReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
}

static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresFpuRegister());
}

void IntrinsicLocationsBuilderX86_64::VisitMathSqrt(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathSqrt(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
  XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();

  GetAssembler()->sqrtsd(out, in);
}

static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) {
  MoveArguments(invoke, codegen);

  DCHECK(invoke->IsInvokeStaticOrDirect());
  codegen->GenerateStaticOrDirectCall(
      invoke->AsInvokeStaticOrDirect(), Location::RegisterLocation(RDI));

  // Copy the result back to the expected output.
  Location out = invoke->GetLocations()->Out();
  if (out.IsValid()) {
    DCHECK(out.IsRegister());
    codegen->MoveFromReturnRegister(out, invoke->GetType());
  }
}

static void CreateSSE41FPToFPLocations(ArenaAllocator* allocator,
                                       HInvoke* invoke,
                                       CodeGeneratorX86_64* codegen) {
  // Do we have instruction support?
  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
    CreateFPToFPLocations(allocator, invoke);
    return;
  }

  // We have to fall back to a call to the intrinsic.
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetOut(Location::FpuRegisterLocation(XMM0));
  // Needs to be RDI for the invoke.
  locations->AddTemp(Location::RegisterLocation(RDI));
}

static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86_64* codegen,
                                    HInvoke* invoke,
                                    X86_64Assembler* assembler,
                                    int round_mode) {
  LocationSummary* locations = invoke->GetLocations();
  if (locations->WillCall()) {
    InvokeOutOfLineIntrinsic(codegen, invoke);
  } else {
    XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
    XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
    __ roundsd(out, in, Immediate(round_mode));
  }
}

void IntrinsicLocationsBuilderX86_64::VisitMathCeil(HInvoke* invoke) {
  CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCeil(HInvoke* invoke) {
  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
}

void IntrinsicLocationsBuilderX86_64::VisitMathFloor(HInvoke* invoke) {
  CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathFloor(HInvoke* invoke) {
  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
}

void IntrinsicLocationsBuilderX86_64::VisitMathRint(HInvoke* invoke) {
  CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathRint(HInvoke* invoke) {
  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
}

static void CreateSSE41FPToIntLocations(ArenaAllocator* allocator,
                                        HInvoke* invoke,
                                        CodeGeneratorX86_64* codegen) {
  // Do we have instruction support?
  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
    LocationSummary* locations =
        new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    locations->SetInAt(0, Location::RequiresFpuRegister());
    locations->SetOut(Location::RequiresRegister());
    locations->AddTemp(Location::RequiresFpuRegister());
    locations->AddTemp(Location::RequiresFpuRegister());
    return;
  }

  // We have to fall back to a call to the intrinsic.
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetOut(Location::RegisterLocation(RAX));
  // Needs to be RDI for the invoke.
  locations->AddTemp(Location::RegisterLocation(RDI));
}

void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) {
  CreateSSE41FPToIntLocations(allocator_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  if (locations->WillCall()) {
    InvokeOutOfLineIntrinsic(codegen_, invoke);
    return;
  }

  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
  XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
  XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
  NearLabel skip_incr, done;
  X86_64Assembler* assembler = GetAssembler();

  // Since no direct x86 rounding instruction matches the required semantics,
  // this intrinsic is implemented as follows:
  //  result = floor(in);
  //  if (in - result >= 0.5f)
  //    result = result + 1.0f;
  __ movss(t2, in);
  __ roundss(t1, in, Immediate(1));
  __ subss(t2, t1);
  __ comiss(t2, codegen_->LiteralFloatAddress(0.5f));
  __ j(kBelow, &skip_incr);
  __ addss(t1, codegen_->LiteralFloatAddress(1.0f));
  __ Bind(&skip_incr);

  // Final conversion to an integer. Unfortunately this also does not have a
  // direct x86 instruction, since NaN should map to 0 and large positive
  // values need to be clipped to the extreme value.
  codegen_->Load32BitValue(out, kPrimIntMax);
  __ cvtsi2ss(t2, out);
  __ comiss(t1, t2);
  __ j(kAboveEqual, &done);  // clipped to max (already in out), does not jump on unordered
  __ movl(out, Immediate(0));  // does not change flags
  __ j(kUnordered, &done);  // NaN mapped to 0 (just moved in out)
  __ cvttss2si(out, t1);
  __ Bind(&done);
}

void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) {
  CreateSSE41FPToIntLocations(allocator_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  if (locations->WillCall()) {
    InvokeOutOfLineIntrinsic(codegen_, invoke);
    return;
  }

  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
  XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
  XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
  NearLabel skip_incr, done;
  X86_64Assembler* assembler = GetAssembler();

  // Since no direct x86 rounding instruction matches the required semantics,
  // this intrinsic is implemented as follows:
  //  result = floor(in);
  //  if (in - result >= 0.5)
  //    result = result + 1.0f;
  __ movsd(t2, in);
  __ roundsd(t1, in, Immediate(1));
  __ subsd(t2, t1);
  __ comisd(t2, codegen_->LiteralDoubleAddress(0.5));
  __ j(kBelow, &skip_incr);
  __ addsd(t1, codegen_->LiteralDoubleAddress(1.0f));
  __ Bind(&skip_incr);

  // Final conversion to an integer. Unfortunately this also does not have a
  // direct x86 instruction, since NaN should map to 0 and large positive
  // values need to be clipped to the extreme value.
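  // For example, a NaN input takes the unordered branch below and yields 0, while
  // any input >= 2^63 compares above-or-equal against the converted bound and keeps
  // the kPrimLongMax already loaded in `out`.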
  codegen_->Load64BitValue(out, kPrimLongMax);
  __ cvtsi2sd(t2, out, /* is64bit= */ true);
  __ comisd(t1, t2);
  __ j(kAboveEqual, &done);  // clipped to max (already in out), does not jump on unordered
  __ movl(out, Immediate(0));  // does not change flags, implicit zero extension to 64-bit
  __ j(kUnordered, &done);  // NaN mapped to 0 (just moved in out)
  __ cvttsd2si(out, t1, /* is64bit= */ true);
  __ Bind(&done);
}

static void CreateFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetOut(Location::FpuRegisterLocation(XMM0));

  // We have to ensure that the native code doesn't clobber the XMM registers which are
  // non-volatile for ART, but volatile for Native calls. This will ensure that they are
  // saved in the prologue and properly restored.
  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
  }
}

static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86_64* codegen,
                          QuickEntrypointEnum entry) {
  LocationSummary* locations = invoke->GetLocations();
  DCHECK(locations->WillCall());
  DCHECK(invoke->IsInvokeStaticOrDirect());

  codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc());
}

void IntrinsicLocationsBuilderX86_64::VisitMathCos(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCos(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickCos);
}

void IntrinsicLocationsBuilderX86_64::VisitMathSin(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathSin(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickSin);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAcos(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAcos(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAcos);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAsin(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAsin(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAsin);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAtan(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAtan(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAtan);
}

void IntrinsicLocationsBuilderX86_64::VisitMathCbrt(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCbrt(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickCbrt);
}

void IntrinsicLocationsBuilderX86_64::VisitMathCosh(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCosh(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickCosh);
}

void IntrinsicLocationsBuilderX86_64::VisitMathExp(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathExp(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickExp);
}

void IntrinsicLocationsBuilderX86_64::VisitMathExpm1(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathExpm1(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickExpm1);
}

void IntrinsicLocationsBuilderX86_64::VisitMathLog(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathLog(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickLog);
}

void IntrinsicLocationsBuilderX86_64::VisitMathLog10(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathLog10(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickLog10);
}

void IntrinsicLocationsBuilderX86_64::VisitMathSinh(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathSinh(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickSinh);
}

void IntrinsicLocationsBuilderX86_64::VisitMathTan(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathTan(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickTan);
}

void IntrinsicLocationsBuilderX86_64::VisitMathTanh(HInvoke* invoke) {
  CreateFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathTanh(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickTanh);
}

static void CreateFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
  locations->SetOut(Location::FpuRegisterLocation(XMM0));

  // We have to ensure that the native code doesn't clobber the XMM registers which are
  // non-volatile for ART, but volatile for Native calls. This will ensure that they are
  // saved in the prologue and properly restored.
  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
  }
}

void IntrinsicLocationsBuilderX86_64::VisitMathAtan2(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAtan2(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAtan2);
}

void IntrinsicLocationsBuilderX86_64::VisitMathPow(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathPow(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickPow);
}

void IntrinsicLocationsBuilderX86_64::VisitMathHypot(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathHypot(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickHypot);
}

void IntrinsicLocationsBuilderX86_64::VisitMathNextAfter(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathNextAfter(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
}

void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
  // Check to see if we have known failures that will cause us to have to bail out
  // to the runtime, and just generate the runtime call directly.
  HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
  HIntConstant* dest_pos = invoke->InputAt(3)->AsIntConstant();

  // The positions must be non-negative.
  if ((src_pos != nullptr && src_pos->GetValue() < 0) ||
      (dest_pos != nullptr && dest_pos->GetValue() < 0)) {
    // We will have to fail anyways.
    return;
  }

  // The length must be >= 0.
  HIntConstant* length = invoke->InputAt(4)->AsIntConstant();
  if (length != nullptr) {
    int32_t len = length->GetValue();
    if (len < 0) {
      // Just call as normal.
      return;
    }
  }

  LocationSummary* locations =
      new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
  // arraycopy(Object src, int src_pos, Object dest, int dest_pos, int length).
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetInAt(3, Location::RegisterOrConstant(invoke->InputAt(3)));
  locations->SetInAt(4, Location::RegisterOrConstant(invoke->InputAt(4)));

  // And we need some temporaries. We will use REP MOVSW, so we need fixed registers.
  locations->AddTemp(Location::RegisterLocation(RSI));
  locations->AddTemp(Location::RegisterLocation(RDI));
  locations->AddTemp(Location::RegisterLocation(RCX));
}

static void CheckPosition(X86_64Assembler* assembler,
                          Location pos,
                          CpuRegister input,
                          Location length,
                          SlowPathCode* slow_path,
                          CpuRegister temp,
                          bool length_is_input_length = false) {
  // Where is the length in the Array?
  const uint32_t length_offset = mirror::Array::LengthOffset().Uint32Value();

  if (pos.IsConstant()) {
    int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue();
    if (pos_const == 0) {
      if (!length_is_input_length) {
        // Check that length(input) >= length.
        if (length.IsConstant()) {
          __ cmpl(Address(input, length_offset),
                  Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
        } else {
          __ cmpl(Address(input, length_offset), length.AsRegister<CpuRegister>());
        }
        __ j(kLess, slow_path->GetEntryLabel());
      }
    } else {
      // Check that length(input) >= pos.
      __ movl(temp, Address(input, length_offset));
      __ subl(temp, Immediate(pos_const));
      __ j(kLess, slow_path->GetEntryLabel());

      // Check that (length(input) - pos) >= length.
      if (length.IsConstant()) {
        __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
      } else {
        __ cmpl(temp, length.AsRegister<CpuRegister>());
      }
      __ j(kLess, slow_path->GetEntryLabel());
    }
  } else if (length_is_input_length) {
    // The only way the copy can succeed is if pos is zero.
    CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
    __ testl(pos_reg, pos_reg);
    __ j(kNotEqual, slow_path->GetEntryLabel());
  } else {
    // Check that pos >= 0.
    CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
    __ testl(pos_reg, pos_reg);
    __ j(kLess, slow_path->GetEntryLabel());

    // Check that pos <= length(input).
    __ cmpl(Address(input, length_offset), pos_reg);
    __ j(kLess, slow_path->GetEntryLabel());

    // Check that (length(input) - pos) >= length.
    __ movl(temp, Address(input, length_offset));
    __ subl(temp, pos_reg);
    if (length.IsConstant()) {
      __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
    } else {
      __ cmpl(temp, length.AsRegister<CpuRegister>());
    }
    __ j(kLess, slow_path->GetEntryLabel());
  }
}

void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
  Location src_pos = locations->InAt(1);
  CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
  Location dest_pos = locations->InAt(3);
  Location length = locations->InAt(4);

  // Temporaries that we need for MOVSW.
  CpuRegister src_base = locations->GetTemp(0).AsRegister<CpuRegister>();
  DCHECK_EQ(src_base.AsRegister(), RSI);
  CpuRegister dest_base = locations->GetTemp(1).AsRegister<CpuRegister>();
  DCHECK_EQ(dest_base.AsRegister(), RDI);
  CpuRegister count = locations->GetTemp(2).AsRegister<CpuRegister>();
  DCHECK_EQ(count.AsRegister(), RCX);

  SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);

  // Bail out if the source and destination are the same.
  __ cmpl(src, dest);
  __ j(kEqual, slow_path->GetEntryLabel());

  // Bail out if the source is null.
  __ testl(src, src);
  __ j(kEqual, slow_path->GetEntryLabel());

  // Bail out if the destination is null.
  __ testl(dest, dest);
  __ j(kEqual, slow_path->GetEntryLabel());

  // If the length is negative, bail out.
  // We have already checked in the LocationsBuilder for the constant case.
  if (!length.IsConstant()) {
    __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
    __ j(kLess, slow_path->GetEntryLabel());
  }

  // Validity checks: source. Use src_base as a temporary register.
  CheckPosition(assembler, src_pos, src, length, slow_path, src_base);

  // Validity checks: dest. Use src_base as a temporary register.
  CheckPosition(assembler, dest_pos, dest, length, slow_path, src_base);

  // We need the count in RCX.
  if (length.IsConstant()) {
    __ movl(count, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
  } else {
    __ movl(count, length.AsRegister<CpuRegister>());
  }

  // Okay, everything checks out. Finally time to do the copy.
  // Check assumption that sizeof(Char) is 2 (used in scaling below).
  const size_t char_size = DataType::Size(DataType::Type::kUint16);
  DCHECK_EQ(char_size, 2u);

  const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();

  if (src_pos.IsConstant()) {
    int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue();
    __ leal(src_base, Address(src, char_size * src_pos_const + data_offset));
  } else {
    __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(),
                              ScaleFactor::TIMES_2, data_offset));
  }
  if (dest_pos.IsConstant()) {
    int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue();
    __ leal(dest_base, Address(dest, char_size * dest_pos_const + data_offset));
  } else {
    __ leal(dest_base, Address(dest, dest_pos.AsRegister<CpuRegister>(),
                               ScaleFactor::TIMES_2, data_offset));
  }

  // Do the move.
  __ rep_movsw();

  __ Bind(slow_path->GetExitLabel());
}


void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
  // The only read barrier implementation supporting the
  // SystemArrayCopy intrinsic is the Baker-style read barriers.
  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
    return;
  }

  CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke);
}

// Compute base source address, base destination address, and end
// source address for the System.arraycopy intrinsic in `src_base`,
// `dst_base` and `src_end` respectively.
static void GenSystemArrayCopyAddresses(X86_64Assembler* assembler,
                                        DataType::Type type,
                                        const CpuRegister& src,
                                        const Location& src_pos,
                                        const CpuRegister& dst,
                                        const Location& dst_pos,
                                        const Location& copy_length,
                                        const CpuRegister& src_base,
                                        const CpuRegister& dst_base,
                                        const CpuRegister& src_end) {
  // This routine is only used by the SystemArrayCopy intrinsic.
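  // Each base is computed as `array + data_offset + pos * element_size`, and
  // `src_end` is `src_base + copy_length * element_size`, one past the last element.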
  DCHECK_EQ(type, DataType::Type::kReference);
  const int32_t element_size = DataType::Size(type);
  const ScaleFactor scale_factor = static_cast<ScaleFactor>(DataType::SizeShift(type));
  const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value();

  if (src_pos.IsConstant()) {
    int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
    __ leal(src_base, Address(src, element_size * constant + data_offset));
  } else {
    __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(), scale_factor, data_offset));
  }

  if (dst_pos.IsConstant()) {
    int32_t constant = dst_pos.GetConstant()->AsIntConstant()->GetValue();
    __ leal(dst_base, Address(dst, element_size * constant + data_offset));
  } else {
    __ leal(dst_base, Address(dst, dst_pos.AsRegister<CpuRegister>(), scale_factor, data_offset));
  }

  if (copy_length.IsConstant()) {
    int32_t constant = copy_length.GetConstant()->AsIntConstant()->GetValue();
    __ leal(src_end, Address(src_base, element_size * constant));
  } else {
    __ leal(src_end, Address(src_base, copy_length.AsRegister<CpuRegister>(), scale_factor, 0));
  }
}

void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
  // The only read barrier implementation supporting the
  // SystemArrayCopy intrinsic is the Baker-style read barriers.
  DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);

  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
  uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
  uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
  uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();

  CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
  Location src_pos = locations->InAt(1);
  CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
  Location dest_pos = locations->InAt(3);
  Location length = locations->InAt(4);
  Location temp1_loc = locations->GetTemp(0);
  CpuRegister temp1 = temp1_loc.AsRegister<CpuRegister>();
  Location temp2_loc = locations->GetTemp(1);
  CpuRegister temp2 = temp2_loc.AsRegister<CpuRegister>();
  Location temp3_loc = locations->GetTemp(2);
  CpuRegister temp3 = temp3_loc.AsRegister<CpuRegister>();
  Location TMP_loc = Location::RegisterLocation(TMP);

  SlowPathCode* intrinsic_slow_path =
      new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(intrinsic_slow_path);

  NearLabel conditions_on_positions_validated;
  SystemArrayCopyOptimizations optimizations(invoke);

  // If source and destination are the same, we go to slow path if we need to do
  // forward copying.
  if (src_pos.IsConstant()) {
    int32_t src_pos_constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
    if (dest_pos.IsConstant()) {
      int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
      if (optimizations.GetDestinationIsSource()) {
        // Checked when building locations.
        DCHECK_GE(src_pos_constant, dest_pos_constant);
      } else if (src_pos_constant < dest_pos_constant) {
        __ cmpl(src, dest);
        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
      }
    } else {
      if (!optimizations.GetDestinationIsSource()) {
        __ cmpl(src, dest);
        __ j(kNotEqual, &conditions_on_positions_validated);
      }
      __ cmpl(dest_pos.AsRegister<CpuRegister>(), Immediate(src_pos_constant));
      __ j(kGreater, intrinsic_slow_path->GetEntryLabel());
    }
  } else {
    if (!optimizations.GetDestinationIsSource()) {
      __ cmpl(src, dest);
      __ j(kNotEqual, &conditions_on_positions_validated);
    }
    if (dest_pos.IsConstant()) {
      int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
      __ cmpl(src_pos.AsRegister<CpuRegister>(), Immediate(dest_pos_constant));
      __ j(kLess, intrinsic_slow_path->GetEntryLabel());
    } else {
      __ cmpl(src_pos.AsRegister<CpuRegister>(), dest_pos.AsRegister<CpuRegister>());
      __ j(kLess, intrinsic_slow_path->GetEntryLabel());
    }
  }

  __ Bind(&conditions_on_positions_validated);

  if (!optimizations.GetSourceIsNotNull()) {
    // Bail out if the source is null.
    __ testl(src, src);
    __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
  }

  if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
    // Bail out if the destination is null.
    __ testl(dest, dest);
    __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
  }

  // If the length is negative, bail out.
  // We have already checked in the LocationsBuilder for the constant case.
  if (!length.IsConstant() &&
      !optimizations.GetCountIsSourceLength() &&
      !optimizations.GetCountIsDestinationLength()) {
    __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
    __ j(kLess, intrinsic_slow_path->GetEntryLabel());
  }

  // Validity checks: source.
  CheckPosition(assembler,
                src_pos,
                src,
                length,
                intrinsic_slow_path,
                temp1,
                optimizations.GetCountIsSourceLength());

  // Validity checks: dest.
  CheckPosition(assembler,
                dest_pos,
                dest,
                length,
                intrinsic_slow_path,
                temp1,
                optimizations.GetCountIsDestinationLength());

  if (!optimizations.GetDoesNotNeedTypeCheck()) {
    // Check whether all elements of the source array are assignable to the component
    // type of the destination array. We do two checks: the classes are the same,
    // or the destination is Object[]. If none of these checks succeed, we go to the
    // slow path.

    bool did_unpoison = false;
    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
      // /* HeapReference<Class> */ temp1 = dest->klass_
      codegen_->GenerateFieldLoadWithBakerReadBarrier(
          invoke, temp1_loc, dest, class_offset, /* needs_null_check= */ false);
      // Register `temp1` is not trashed by the read barrier emitted
      // by GenerateFieldLoadWithBakerReadBarrier below, as that
      // method produces a call to a ReadBarrierMarkRegX entry point,
      // which saves all potentially live registers, including
      // temporaries such as `temp1`.
      // /* HeapReference<Class> */ temp2 = src->klass_
      codegen_->GenerateFieldLoadWithBakerReadBarrier(
          invoke, temp2_loc, src, class_offset, /* needs_null_check= */ false);
      // If heap poisoning is enabled, `temp1` and `temp2` have been
      // unpoisoned by the previous calls to
      // GenerateFieldLoadWithBakerReadBarrier.
    } else {
      // /* HeapReference<Class> */ temp1 = dest->klass_
      __ movl(temp1, Address(dest, class_offset));
      // /* HeapReference<Class> */ temp2 = src->klass_
      __ movl(temp2, Address(src, class_offset));
      if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
          !optimizations.GetSourceIsNonPrimitiveArray()) {
        // One or two of the references need to be unpoisoned. Unpoison them
        // both to make the identity check valid.
        __ MaybeUnpoisonHeapReference(temp1);
        __ MaybeUnpoisonHeapReference(temp2);
        did_unpoison = true;
      }
    }

    if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
      // Bail out if the destination is not a non primitive array.
      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
        // /* HeapReference<Class> */ TMP = temp1->component_type_
        codegen_->GenerateFieldLoadWithBakerReadBarrier(
            invoke, TMP_loc, temp1, component_offset, /* needs_null_check= */ false);
        __ testl(CpuRegister(TMP), CpuRegister(TMP));
        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
        // If heap poisoning is enabled, `TMP` has been unpoisoned by
        // the previous call to GenerateFieldLoadWithBakerReadBarrier.
      } else {
        // /* HeapReference<Class> */ TMP = temp1->component_type_
        __ movl(CpuRegister(TMP), Address(temp1, component_offset));
        __ testl(CpuRegister(TMP), CpuRegister(TMP));
        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
        __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
      }
      __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
    }

    if (!optimizations.GetSourceIsNonPrimitiveArray()) {
      // Bail out if the source is not a non primitive array.
      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
        // For the same reason given earlier, `temp1` is not trashed by the
        // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
        // /* HeapReference<Class> */ TMP = temp2->component_type_
        codegen_->GenerateFieldLoadWithBakerReadBarrier(
            invoke, TMP_loc, temp2, component_offset, /* needs_null_check= */ false);
        __ testl(CpuRegister(TMP), CpuRegister(TMP));
        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
        // If heap poisoning is enabled, `TMP` has been unpoisoned by
        // the previous call to GenerateFieldLoadWithBakerReadBarrier.
      } else {
        // /* HeapReference<Class> */ TMP = temp2->component_type_
        __ movl(CpuRegister(TMP), Address(temp2, component_offset));
        __ testl(CpuRegister(TMP), CpuRegister(TMP));
        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
        __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
      }
      __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
    }

    __ cmpl(temp1, temp2);

    if (optimizations.GetDestinationIsTypedObjectArray()) {
      NearLabel do_copy;
      __ j(kEqual, &do_copy);
      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
        // /* HeapReference<Class> */ temp1 = temp1->component_type_
        codegen_->GenerateFieldLoadWithBakerReadBarrier(
            invoke, temp1_loc, temp1, component_offset, /* needs_null_check= */ false);
        // We do not need to emit a read barrier for the following
        // heap reference load, as `temp1` is only used in a
        // comparison with null below, and this reference is not
        // kept afterwards.
        __ cmpl(Address(temp1, super_offset), Immediate(0));
      } else {
        if (!did_unpoison) {
          __ MaybeUnpoisonHeapReference(temp1);
        }
        // /* HeapReference<Class> */ temp1 = temp1->component_type_
        __ movl(temp1, Address(temp1, component_offset));
        __ MaybeUnpoisonHeapReference(temp1);
        // No need to unpoison the following heap reference load, as
        // we're comparing against null.
        __ cmpl(Address(temp1, super_offset), Immediate(0));
      }
      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
      __ Bind(&do_copy);
    } else {
      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
    }
  } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
    DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
    // Bail out if the source is not a non primitive array.
    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
      // /* HeapReference<Class> */ temp1 = src->klass_
      codegen_->GenerateFieldLoadWithBakerReadBarrier(
          invoke, temp1_loc, src, class_offset, /* needs_null_check= */ false);
      // /* HeapReference<Class> */ TMP = temp1->component_type_
      codegen_->GenerateFieldLoadWithBakerReadBarrier(
          invoke, TMP_loc, temp1, component_offset, /* needs_null_check= */ false);
      __ testl(CpuRegister(TMP), CpuRegister(TMP));
      __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
    } else {
      // /* HeapReference<Class> */ temp1 = src->klass_
      __ movl(temp1, Address(src, class_offset));
      __ MaybeUnpoisonHeapReference(temp1);
      // /* HeapReference<Class> */ TMP = temp1->component_type_
      __ movl(CpuRegister(TMP), Address(temp1, component_offset));
      // No need to unpoison `TMP` now, as we're comparing against null.
      __ testl(CpuRegister(TMP), CpuRegister(TMP));
      __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
      __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
    }
    __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
    __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
  }

  const DataType::Type type = DataType::Type::kReference;
  const int32_t element_size = DataType::Size(type);

  // Compute base source address, base destination address, and end
  // source address in `temp1`, `temp2` and `temp3` respectively.
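  // `temp3` (the end of the source range) acts as the termination sentinel for the
  // copy loops below.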
  GenSystemArrayCopyAddresses(
      GetAssembler(), type, src, src_pos, dest, dest_pos, length, temp1, temp2, temp3);

  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
    // SystemArrayCopy implementation for Baker read barriers (see
    // also CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier):
    //
    //   if (src_ptr != end_ptr) {
    //     uint32_t rb_state = LockWord(src->monitor_).ReadBarrierState();
    //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
    //     bool is_gray = (rb_state == ReadBarrier::GrayState());
    //     if (is_gray) {
    //       // Slow-path copy.
    //       do {
    //         *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
    //       } while (src_ptr != end_ptr)
    //     } else {
    //       // Fast-path copy.
    //       do {
    //         *dest_ptr++ = *src_ptr++;
    //       } while (src_ptr != end_ptr)
    //     }
    //   }

    NearLabel loop, done;

    // Don't enter copy loop if `length == 0`.
    __ cmpl(temp1, temp3);
    __ j(kEqual, &done);

    // Given the numeric representation, it's enough to check the low bit of the rb_state.
    static_assert(ReadBarrier::NonGrayState() == 0, "Expecting non-gray to have value 0");
    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
    constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
    constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
    constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);

    // if (rb_state == ReadBarrier::GrayState())
    //   goto slow_path;
    // At this point, just do the "if" and make sure that flags are preserved until the branch.
    __ testb(Address(src, monitor_offset + gray_byte_position), Immediate(test_value));

    // Load fence to prevent load-load reordering.
    // Note that this is a no-op, thanks to the x86-64 memory model.
    codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);

    // Slow path used to copy array when `src` is gray.
    SlowPathCode* read_barrier_slow_path =
        new (codegen_->GetScopedAllocator()) ReadBarrierSystemArrayCopySlowPathX86_64(invoke);
    codegen_->AddSlowPath(read_barrier_slow_path);

    // We have done the "if" of the gray bit check above, now branch based on the flags.
    __ j(kNotZero, read_barrier_slow_path->GetEntryLabel());

    // Fast-path copy.
    // Iterate over the arrays and do a raw copy of the objects. We don't need to
    // poison/unpoison.
    __ Bind(&loop);
    __ movl(CpuRegister(TMP), Address(temp1, 0));
    __ movl(Address(temp2, 0), CpuRegister(TMP));
    __ addl(temp1, Immediate(element_size));
    __ addl(temp2, Immediate(element_size));
    __ cmpl(temp1, temp3);
    __ j(kNotEqual, &loop);

    __ Bind(read_barrier_slow_path->GetExitLabel());
    __ Bind(&done);
  } else {
    // Non read barrier code.

    // Iterate over the arrays and do a raw copy of the objects. We don't need to
    // poison/unpoison.
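    // Copying the encoded (possibly poisoned) value verbatim leaves the destination
    // slot with exactly the same representation, so no decode/encode is required.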
    NearLabel loop, done;
    __ cmpl(temp1, temp3);
    __ j(kEqual, &done);
    __ Bind(&loop);
    __ movl(CpuRegister(TMP), Address(temp1, 0));
    __ movl(Address(temp2, 0), CpuRegister(TMP));
    __ addl(temp1, Immediate(element_size));
    __ addl(temp2, Immediate(element_size));
    __ cmpl(temp1, temp3);
    __ j(kNotEqual, &loop);
    __ Bind(&done);
  }

  // We only need one card marking on the destination array.
  codegen_->MarkGCCard(temp1, temp2, dest, CpuRegister(kNoRegister), /* value_can_be_null= */ false);

  __ Bind(intrinsic_slow_path->GetExitLabel());
}

void IntrinsicLocationsBuilderX86_64::VisitStringCompareTo(HInvoke* invoke) {
  LocationSummary* locations = new (allocator_) LocationSummary(
      invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
  locations->SetOut(Location::RegisterLocation(RAX));
}

void IntrinsicCodeGeneratorX86_64::VisitStringCompareTo(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  // Note that the null check must have been done earlier.
  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));

  CpuRegister argument = locations->InAt(1).AsRegister<CpuRegister>();
  __ testl(argument, argument);
  SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);
  __ j(kEqual, slow_path->GetEntryLabel());

  codegen_->InvokeRuntime(kQuickStringCompareTo, invoke, invoke->GetDexPc(), slow_path);
  __ Bind(slow_path->GetExitLabel());
}

void IntrinsicLocationsBuilderX86_64::VisitStringEquals(HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());

  // Request temporary registers, RCX and RDI needed for repe_cmpsq instruction.
  locations->AddTemp(Location::RegisterLocation(RCX));
  locations->AddTemp(Location::RegisterLocation(RDI));

  // Set output, RSI needed for repe_cmpsq instruction anyways.
  locations->SetOut(Location::RegisterLocation(RSI), Location::kOutputOverlap);
}

void IntrinsicCodeGeneratorX86_64::VisitStringEquals(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister str = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister arg = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister rcx = locations->GetTemp(0).AsRegister<CpuRegister>();
  CpuRegister rdi = locations->GetTemp(1).AsRegister<CpuRegister>();
  CpuRegister rsi = locations->Out().AsRegister<CpuRegister>();

  NearLabel end, return_true, return_false;

  // Get offsets of count, value, and class fields within a string object.
  const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
  const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
  const uint32_t class_offset = mirror::Object::ClassOffset().Uint32Value();

  // Note that the null check must have been done earlier.
  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));

  StringEqualsOptimizations optimizations(invoke);
  if (!optimizations.GetArgumentNotNull()) {
    // Check if input is null, return false if it is.
    __ testl(arg, arg);
    __ j(kEqual, &return_false);
  }

  if (!optimizations.GetArgumentIsString()) {
    // Instanceof check for the argument by comparing class fields.
    // All string objects must have the same type since String cannot be subclassed.
    // Receiver must be a string object, so its class field is equal to all strings' class fields.
    // If the argument is a string object, its class field must be equal to receiver's class field.
    //
    // As the String class is expected to be non-movable, we can read the class
    // field from String.equals' arguments without read barriers.
    AssertNonMovableStringClass();
    // Also, because we use the loaded class references only to compare them, we
    // don't need to unpoison them.
    // /* HeapReference<Class> */ rcx = str->klass_
    __ movl(rcx, Address(str, class_offset));
    // if (rcx != /* HeapReference<Class> */ arg->klass_) return false
    __ cmpl(rcx, Address(arg, class_offset));
    __ j(kNotEqual, &return_false);
  }

  // Reference equality check, return true if same reference.
  __ cmpl(str, arg);
  __ j(kEqual, &return_true);

  // Load length and compression flag of receiver string.
  __ movl(rcx, Address(str, count_offset));
  // Check if lengths and compression flags are equal, return false if they're not.
  // Two identical strings will always have same compression style since
  // compression style is decided on alloc.
  __ cmpl(rcx, Address(arg, count_offset));
  __ j(kNotEqual, &return_false);
  // Return true if both strings are empty. Even with string compression `count == 0` means empty.
  static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
                "Expecting 0=compressed, 1=uncompressed");
  __ jrcxz(&return_true);

  if (mirror::kUseStringCompression) {
    NearLabel string_uncompressed;
    // Extract length and differentiate between both compressed or both uncompressed.
    // Different compression style is cut above.
    __ shrl(rcx, Immediate(1));
    __ j(kCarrySet, &string_uncompressed);
    // Divide string length by 2, rounding up, and continue as if uncompressed.
    // Merge clearing the compression flag with +1 for rounding.
    __ addl(rcx, Immediate(1));
    __ shrl(rcx, Immediate(1));
    __ Bind(&string_uncompressed);
  }
  // Load starting addresses of string values into RSI/RDI as required for repe_cmpsq instruction.
  __ leal(rsi, Address(str, value_offset));
  __ leal(rdi, Address(arg, value_offset));

  // Divide string length by 4 and adjust for lengths not divisible by 4.
  __ addl(rcx, Immediate(3));
  __ shrl(rcx, Immediate(2));

  // Assertions that must hold in order to compare strings 4 characters (uncompressed)
  // or 8 characters (compressed) at a time.
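  // repe_cmpsq compares 8 bytes per iteration, i.e. four uint16_t code units or
  // eight compressed (Latin-1) bytes, so the value storage must be 8-byte aligned
  // and zero-padded out to the object alignment.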
  DCHECK_ALIGNED(value_offset, 8);
  static_assert(IsAligned<8>(kObjectAlignment), "String is not zero padded");

  // Loop to compare strings four characters at a time starting at the beginning of the string.
  __ repe_cmpsq();
  // If strings are not equal, zero flag will be cleared.
  __ j(kNotEqual, &return_false);

  // Return true and exit the function.
  // If loop does not result in returning false, we return true.
  __ Bind(&return_true);
  __ movl(rsi, Immediate(1));
  __ jmp(&end);

  // Return false and exit the function.
  __ Bind(&return_false);
  __ xorl(rsi, rsi);
  __ Bind(&end);
}

static void CreateStringIndexOfLocations(HInvoke* invoke,
                                         ArenaAllocator* allocator,
                                         bool start_at_zero) {
  LocationSummary* locations = new (allocator) LocationSummary(invoke,
                                                               LocationSummary::kCallOnSlowPath,
                                                               kIntrinsified);
  // The data needs to be in RDI for scasw. So request that the string is there, anyways.
  locations->SetInAt(0, Location::RegisterLocation(RDI));
  // If we look for a constant char, we'll still have to copy it into RAX. So just request the
  // allocator to do that, anyways. We can still do the constant check by checking the parameter
  // of the instruction explicitly.
  // Note: This works as we don't clobber RAX anywhere.
  locations->SetInAt(1, Location::RegisterLocation(RAX));
  if (!start_at_zero) {
    locations->SetInAt(2, Location::RequiresRegister());  // The starting index.
  }
  // As we clobber RDI during execution anyways, also use it as the output.
  locations->SetOut(Location::SameAsFirstInput());

  // repne scasw uses RCX as the counter.
  locations->AddTemp(Location::RegisterLocation(RCX));
  // Need another temporary to be able to compute the result.
  locations->AddTemp(Location::RequiresRegister());
}

static void GenerateStringIndexOf(HInvoke* invoke,
                                  X86_64Assembler* assembler,
                                  CodeGeneratorX86_64* codegen,
                                  bool start_at_zero) {
  LocationSummary* locations = invoke->GetLocations();

  // Note that the null check must have been done earlier.
  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));

  CpuRegister string_obj = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister search_value = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister counter = locations->GetTemp(0).AsRegister<CpuRegister>();
  CpuRegister string_length = locations->GetTemp(1).AsRegister<CpuRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  // Check our assumptions for registers.
  DCHECK_EQ(string_obj.AsRegister(), RDI);
  DCHECK_EQ(search_value.AsRegister(), RAX);
  DCHECK_EQ(counter.AsRegister(), RCX);
  DCHECK_EQ(out.AsRegister(), RDI);

  // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
  // or directly dispatch for a large constant, or omit slow-path for a small constant or a char.
  SlowPathCode* slow_path = nullptr;
  HInstruction* code_point = invoke->InputAt(1);
  if (code_point->IsIntConstant()) {
    if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) >
        std::numeric_limits<uint16_t>::max()) {
      // Always needs the slow-path. We could directly dispatch to it, but this case should be
      // rare, so for simplicity just put the full slow-path down and branch unconditionally.
1404 slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke); 1405 codegen->AddSlowPath(slow_path); 1406 __ jmp(slow_path->GetEntryLabel()); 1407 __ Bind(slow_path->GetExitLabel()); 1408 return; 1409 } 1410 } else if (code_point->GetType() != DataType::Type::kUint16) { 1411 __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max())); 1412 slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke); 1413 codegen->AddSlowPath(slow_path); 1414 __ j(kAbove, slow_path->GetEntryLabel()); 1415 } 1416 1417 // From here down, we know that we are looking for a char that fits in 1418 // 16 bits (uncompressed) or 8 bits (compressed). 1419 // Location of reference to data array within the String object. 1420 int32_t value_offset = mirror::String::ValueOffset().Int32Value(); 1421 // Location of count within the String object. 1422 int32_t count_offset = mirror::String::CountOffset().Int32Value(); 1423 1424 // Load the count field of the string containing the length and compression flag. 1425 __ movl(string_length, Address(string_obj, count_offset)); 1426 1427 // Do a zero-length check. Even with string compression `count == 0` means empty. 1428 // TODO: Support jecxz. 1429 NearLabel not_found_label; 1430 __ testl(string_length, string_length); 1431 __ j(kEqual, ¬_found_label); 1432 1433 if (mirror::kUseStringCompression) { 1434 // Use TMP to keep string_length_flagged. 1435 __ movl(CpuRegister(TMP), string_length); 1436 // Mask out first bit used as compression flag. 1437 __ shrl(string_length, Immediate(1)); 1438 } 1439 1440 if (start_at_zero) { 1441 // Number of chars to scan is the same as the string length. 1442 __ movl(counter, string_length); 1443 // Move to the start of the string. 1444 __ addq(string_obj, Immediate(value_offset)); 1445 } else { 1446 CpuRegister start_index = locations->InAt(2).AsRegister<CpuRegister>(); 1447 1448 // Do a start_index check. 1449 __ cmpl(start_index, string_length); 1450 __ j(kGreaterEqual, ¬_found_label); 1451 1452 // Ensure we have a start index >= 0; 1453 __ xorl(counter, counter); 1454 __ cmpl(start_index, Immediate(0)); 1455 __ cmov(kGreater, counter, start_index, /* is64bit= */ false); // 32-bit copy is enough. 1456 1457 if (mirror::kUseStringCompression) { 1458 NearLabel modify_counter, offset_uncompressed_label; 1459 __ testl(CpuRegister(TMP), Immediate(1)); 1460 __ j(kNotZero, &offset_uncompressed_label); 1461 __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_1, value_offset)); 1462 __ jmp(&modify_counter); 1463 // Move to the start of the string: string_obj + value_offset + 2 * start_index. 1464 __ Bind(&offset_uncompressed_label); 1465 __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset)); 1466 __ Bind(&modify_counter); 1467 } else { 1468 __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset)); 1469 } 1470 // Now update ecx, the work counter: it's gonna be string.length - start_index. 1471 __ negq(counter); // Needs to be 64-bit negation, as the address computation is 64-bit. 1472 __ leaq(counter, Address(string_length, counter, ScaleFactor::TIMES_1, 0)); 1473 } 1474 1475 if (mirror::kUseStringCompression) { 1476 NearLabel uncompressed_string_comparison; 1477 NearLabel comparison_done; 1478 __ testl(CpuRegister(TMP), Immediate(1)); 1479 __ j(kNotZero, &uncompressed_string_comparison); 1480 // Check if RAX (search_value) is ASCII. 
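// Compressed strings store only ASCII (<= 0x7f) characters, so a larger search value can never
// occur in them and we can go straight to the not-found path.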
1481 __ cmpl(search_value, Immediate(127)); 1482 __ j(kGreater, ¬_found_label); 1483 // Comparing byte-per-byte. 1484 __ repne_scasb(); 1485 __ jmp(&comparison_done); 1486 // Everything is set up for repne scasw: 1487 // * Comparison address in RDI. 1488 // * Counter in ECX. 1489 __ Bind(&uncompressed_string_comparison); 1490 __ repne_scasw(); 1491 __ Bind(&comparison_done); 1492 } else { 1493 __ repne_scasw(); 1494 } 1495 // Did we find a match? 1496 __ j(kNotEqual, ¬_found_label); 1497 1498 // Yes, we matched. Compute the index of the result. 1499 __ subl(string_length, counter); 1500 __ leal(out, Address(string_length, -1)); 1501 1502 NearLabel done; 1503 __ jmp(&done); 1504 1505 // Failed to match; return -1. 1506 __ Bind(¬_found_label); 1507 __ movl(out, Immediate(-1)); 1508 1509 // And join up at the end. 1510 __ Bind(&done); 1511 if (slow_path != nullptr) { 1512 __ Bind(slow_path->GetExitLabel()); 1513 } 1514 } 1515 1516 void IntrinsicLocationsBuilderX86_64::VisitStringIndexOf(HInvoke* invoke) { 1517 CreateStringIndexOfLocations(invoke, allocator_, /* start_at_zero= */ true); 1518 } 1519 1520 void IntrinsicCodeGeneratorX86_64::VisitStringIndexOf(HInvoke* invoke) { 1521 GenerateStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero= */ true); 1522 } 1523 1524 void IntrinsicLocationsBuilderX86_64::VisitStringIndexOfAfter(HInvoke* invoke) { 1525 CreateStringIndexOfLocations(invoke, allocator_, /* start_at_zero= */ false); 1526 } 1527 1528 void IntrinsicCodeGeneratorX86_64::VisitStringIndexOfAfter(HInvoke* invoke) { 1529 GenerateStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero= */ false); 1530 } 1531 1532 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) { 1533 LocationSummary* locations = new (allocator_) LocationSummary( 1534 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); 1535 InvokeRuntimeCallingConvention calling_convention; 1536 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); 1537 locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1))); 1538 locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2))); 1539 locations->SetInAt(3, Location::RegisterLocation(calling_convention.GetRegisterAt(3))); 1540 locations->SetOut(Location::RegisterLocation(RAX)); 1541 } 1542 1543 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) { 1544 X86_64Assembler* assembler = GetAssembler(); 1545 LocationSummary* locations = invoke->GetLocations(); 1546 1547 CpuRegister byte_array = locations->InAt(0).AsRegister<CpuRegister>(); 1548 __ testl(byte_array, byte_array); 1549 SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke); 1550 codegen_->AddSlowPath(slow_path); 1551 __ j(kEqual, slow_path->GetEntryLabel()); 1552 1553 codegen_->InvokeRuntime(kQuickAllocStringFromBytes, invoke, invoke->GetDexPc()); 1554 CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>(); 1555 __ Bind(slow_path->GetExitLabel()); 1556 } 1557 1558 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromChars(HInvoke* invoke) { 1559 LocationSummary* locations = 1560 new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified); 1561 InvokeRuntimeCallingConvention calling_convention; 1562 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); 1563 locations->SetInAt(1, 
Location::RegisterLocation(calling_convention.GetRegisterAt(1))); 1564 locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2))); 1565 locations->SetOut(Location::RegisterLocation(RAX)); 1566 } 1567 1568 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke) { 1569 // No need to emit code checking whether `locations->InAt(2)` is a null 1570 // pointer, as callers of the native method 1571 // 1572 // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data) 1573 // 1574 // all include a null check on `data` before calling that method. 1575 codegen_->InvokeRuntime(kQuickAllocStringFromChars, invoke, invoke->GetDexPc()); 1576 CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>(); 1577 } 1578 1579 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromString(HInvoke* invoke) { 1580 LocationSummary* locations = new (allocator_) LocationSummary( 1581 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); 1582 InvokeRuntimeCallingConvention calling_convention; 1583 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); 1584 locations->SetOut(Location::RegisterLocation(RAX)); 1585 } 1586 1587 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromString(HInvoke* invoke) { 1588 X86_64Assembler* assembler = GetAssembler(); 1589 LocationSummary* locations = invoke->GetLocations(); 1590 1591 CpuRegister string_to_copy = locations->InAt(0).AsRegister<CpuRegister>(); 1592 __ testl(string_to_copy, string_to_copy); 1593 SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke); 1594 codegen_->AddSlowPath(slow_path); 1595 __ j(kEqual, slow_path->GetEntryLabel()); 1596 1597 codegen_->InvokeRuntime(kQuickAllocStringFromString, invoke, invoke->GetDexPc()); 1598 CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>(); 1599 __ Bind(slow_path->GetExitLabel()); 1600 } 1601 1602 void IntrinsicLocationsBuilderX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) { 1603 // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin); 1604 LocationSummary* locations = 1605 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); 1606 locations->SetInAt(0, Location::RequiresRegister()); 1607 locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1))); 1608 locations->SetInAt(2, Location::RequiresRegister()); 1609 locations->SetInAt(3, Location::RequiresRegister()); 1610 locations->SetInAt(4, Location::RequiresRegister()); 1611 1612 // And we need some temporaries. We will use REP MOVSW, so we need fixed registers. 1613 locations->AddTemp(Location::RegisterLocation(RSI)); 1614 locations->AddTemp(Location::RegisterLocation(RDI)); 1615 locations->AddTemp(Location::RegisterLocation(RCX)); 1616 } 1617 1618 void IntrinsicCodeGeneratorX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) { 1619 X86_64Assembler* assembler = GetAssembler(); 1620 LocationSummary* locations = invoke->GetLocations(); 1621 1622 size_t char_component_size = DataType::Size(DataType::Type::kUint16); 1623 // Location of data in char array buffer. 1624 const uint32_t data_offset = mirror::Array::DataOffset(char_component_size).Uint32Value(); 1625 // Location of char array data in string. 
1626 const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value(); 1627 1628 // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin); 1629 CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>(); 1630 Location srcBegin = locations->InAt(1); 1631 int srcBegin_value = 1632 srcBegin.IsConstant() ? srcBegin.GetConstant()->AsIntConstant()->GetValue() : 0; 1633 CpuRegister srcEnd = locations->InAt(2).AsRegister<CpuRegister>(); 1634 CpuRegister dst = locations->InAt(3).AsRegister<CpuRegister>(); 1635 CpuRegister dstBegin = locations->InAt(4).AsRegister<CpuRegister>(); 1636 1637 // Check assumption that sizeof(Char) is 2 (used in scaling below). 1638 const size_t char_size = DataType::Size(DataType::Type::kUint16); 1639 DCHECK_EQ(char_size, 2u); 1640 1641 NearLabel done; 1642 // Compute the number of chars (words) to move. 1643 __ movl(CpuRegister(RCX), srcEnd); 1644 if (srcBegin.IsConstant()) { 1645 __ subl(CpuRegister(RCX), Immediate(srcBegin_value)); 1646 } else { 1647 DCHECK(srcBegin.IsRegister()); 1648 __ subl(CpuRegister(RCX), srcBegin.AsRegister<CpuRegister>()); 1649 } 1650 if (mirror::kUseStringCompression) { 1651 NearLabel copy_uncompressed, copy_loop; 1652 const size_t c_char_size = DataType::Size(DataType::Type::kInt8); 1653 DCHECK_EQ(c_char_size, 1u); 1654 // Location of count in string. 1655 const uint32_t count_offset = mirror::String::CountOffset().Uint32Value(); 1656 1657 __ testl(Address(obj, count_offset), Immediate(1)); 1658 static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u, 1659 "Expecting 0=compressed, 1=uncompressed"); 1660 __ j(kNotZero, ©_uncompressed); 1661 // Compute the address of the source string by adding the number of chars from 1662 // the source beginning to the value offset of a string. 1663 __ leaq(CpuRegister(RSI), 1664 CodeGeneratorX86_64::ArrayAddress(obj, srcBegin, TIMES_1, value_offset)); 1665 // Start the loop to copy String's value to Array of Char. 1666 __ leaq(CpuRegister(RDI), Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset)); 1667 1668 __ Bind(©_loop); 1669 __ jrcxz(&done); 1670 // Use TMP as temporary (convert byte from RSI to word). 1671 // TODO: Selecting RAX as the temporary and using LODSB/STOSW. 1672 __ movzxb(CpuRegister(TMP), Address(CpuRegister(RSI), 0)); 1673 __ movw(Address(CpuRegister(RDI), 0), CpuRegister(TMP)); 1674 __ leaq(CpuRegister(RDI), Address(CpuRegister(RDI), char_size)); 1675 __ leaq(CpuRegister(RSI), Address(CpuRegister(RSI), c_char_size)); 1676 // TODO: Add support for LOOP to X86_64Assembler. 1677 __ subl(CpuRegister(RCX), Immediate(1)); 1678 __ jmp(©_loop); 1679 1680 __ Bind(©_uncompressed); 1681 } 1682 1683 __ leaq(CpuRegister(RSI), 1684 CodeGeneratorX86_64::ArrayAddress(obj, srcBegin, TIMES_2, value_offset)); 1685 // Compute the address of the destination buffer. 1686 __ leaq(CpuRegister(RDI), Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset)); 1687 // Do the move. 1688 __ rep_movsw(); 1689 1690 __ Bind(&done); 1691 } 1692 1693 static void GenPeek(LocationSummary* locations, DataType::Type size, X86_64Assembler* assembler) { 1694 CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>(); 1695 CpuRegister out = locations->Out().AsRegister<CpuRegister>(); // == address, here for clarity. 1696 // x86 allows unaligned access. We do not have to check the input or use specific instructions 1697 // to avoid a SIGBUS. 
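// Rough C-style sketch of the loads below (names illustrative): the long argument is a raw native
// address, so e.g. peekIntNative(addr) is roughly *reinterpret_cast<int32_t*>(addr), with byte and
// short results sign-extended to 32 bits via MOVSX.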
1698 switch (size) { 1699 case DataType::Type::kInt8: 1700 __ movsxb(out, Address(address, 0)); 1701 break; 1702 case DataType::Type::kInt16: 1703 __ movsxw(out, Address(address, 0)); 1704 break; 1705 case DataType::Type::kInt32: 1706 __ movl(out, Address(address, 0)); 1707 break; 1708 case DataType::Type::kInt64: 1709 __ movq(out, Address(address, 0)); 1710 break; 1711 default: 1712 LOG(FATAL) << "Type not recognized for peek: " << size; 1713 UNREACHABLE(); 1714 } 1715 } 1716 1717 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekByte(HInvoke* invoke) { 1718 CreateIntToIntLocations(allocator_, invoke); 1719 } 1720 1721 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekByte(HInvoke* invoke) { 1722 GenPeek(invoke->GetLocations(), DataType::Type::kInt8, GetAssembler()); 1723 } 1724 1725 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) { 1726 CreateIntToIntLocations(allocator_, invoke); 1727 } 1728 1729 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) { 1730 GenPeek(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler()); 1731 } 1732 1733 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) { 1734 CreateIntToIntLocations(allocator_, invoke); 1735 } 1736 1737 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) { 1738 GenPeek(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler()); 1739 } 1740 1741 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) { 1742 CreateIntToIntLocations(allocator_, invoke); 1743 } 1744 1745 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) { 1746 GenPeek(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler()); 1747 } 1748 1749 static void CreateIntIntToVoidLocations(ArenaAllocator* allocator, HInvoke* invoke) { 1750 LocationSummary* locations = 1751 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); 1752 locations->SetInAt(0, Location::RequiresRegister()); 1753 locations->SetInAt(1, Location::RegisterOrInt32Constant(invoke->InputAt(1))); 1754 } 1755 1756 static void GenPoke(LocationSummary* locations, DataType::Type size, X86_64Assembler* assembler) { 1757 CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>(); 1758 Location value = locations->InAt(1); 1759 // x86 allows unaligned access. We do not have to check the input or use specific instructions 1760 // to avoid a SIGBUS. 
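// Rough C-style sketch (names illustrative): pokeIntNative(addr, v) is roughly
// *reinterpret_cast<int32_t*>(addr) = v. Note that MOVQ only accepts a sign-extended 32-bit
// immediate, hence the IsInt<32> check in the 64-bit constant case below.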
1761 switch (size) { 1762 case DataType::Type::kInt8: 1763 if (value.IsConstant()) { 1764 __ movb(Address(address, 0), 1765 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant()))); 1766 } else { 1767 __ movb(Address(address, 0), value.AsRegister<CpuRegister>()); 1768 } 1769 break; 1770 case DataType::Type::kInt16: 1771 if (value.IsConstant()) { 1772 __ movw(Address(address, 0), 1773 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant()))); 1774 } else { 1775 __ movw(Address(address, 0), value.AsRegister<CpuRegister>()); 1776 } 1777 break; 1778 case DataType::Type::kInt32: 1779 if (value.IsConstant()) { 1780 __ movl(Address(address, 0), 1781 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant()))); 1782 } else { 1783 __ movl(Address(address, 0), value.AsRegister<CpuRegister>()); 1784 } 1785 break; 1786 case DataType::Type::kInt64: 1787 if (value.IsConstant()) { 1788 int64_t v = value.GetConstant()->AsLongConstant()->GetValue(); 1789 DCHECK(IsInt<32>(v)); 1790 int32_t v_32 = v; 1791 __ movq(Address(address, 0), Immediate(v_32)); 1792 } else { 1793 __ movq(Address(address, 0), value.AsRegister<CpuRegister>()); 1794 } 1795 break; 1796 default: 1797 LOG(FATAL) << "Type not recognized for poke: " << size; 1798 UNREACHABLE(); 1799 } 1800 } 1801 1802 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeByte(HInvoke* invoke) { 1803 CreateIntIntToVoidLocations(allocator_, invoke); 1804 } 1805 1806 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeByte(HInvoke* invoke) { 1807 GenPoke(invoke->GetLocations(), DataType::Type::kInt8, GetAssembler()); 1808 } 1809 1810 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) { 1811 CreateIntIntToVoidLocations(allocator_, invoke); 1812 } 1813 1814 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) { 1815 GenPoke(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler()); 1816 } 1817 1818 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) { 1819 CreateIntIntToVoidLocations(allocator_, invoke); 1820 } 1821 1822 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) { 1823 GenPoke(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler()); 1824 } 1825 1826 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) { 1827 CreateIntIntToVoidLocations(allocator_, invoke); 1828 } 1829 1830 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) { 1831 GenPoke(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler()); 1832 } 1833 1834 void IntrinsicLocationsBuilderX86_64::VisitThreadCurrentThread(HInvoke* invoke) { 1835 LocationSummary* locations = 1836 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); 1837 locations->SetOut(Location::RequiresRegister()); 1838 } 1839 1840 void IntrinsicCodeGeneratorX86_64::VisitThreadCurrentThread(HInvoke* invoke) { 1841 CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>(); 1842 GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64PointerSize>(), 1843 /* no_rip= */ true)); 1844 } 1845 1846 static void GenUnsafeGet(HInvoke* invoke, 1847 DataType::Type type, 1848 bool is_volatile ATTRIBUTE_UNUSED, 1849 CodeGeneratorX86_64* codegen) { 1850 X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler()); 1851 LocationSummary* locations = invoke->GetLocations(); 1852 Location base_loc = locations->InAt(1); 1853 CpuRegister base = 
base_loc.AsRegister<CpuRegister>(); 1854 Location offset_loc = locations->InAt(2); 1855 CpuRegister offset = offset_loc.AsRegister<CpuRegister>(); 1856 Location output_loc = locations->Out(); 1857 CpuRegister output = output_loc.AsRegister<CpuRegister>(); 1858 1859 switch (type) { 1860 case DataType::Type::kInt32: 1861 __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0)); 1862 break; 1863 1864 case DataType::Type::kReference: { 1865 if (kEmitCompilerReadBarrier) { 1866 if (kUseBakerReadBarrier) { 1867 Address src(base, offset, ScaleFactor::TIMES_1, 0); 1868 codegen->GenerateReferenceLoadWithBakerReadBarrier( 1869 invoke, output_loc, base, src, /* needs_null_check= */ false); 1870 } else { 1871 __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0)); 1872 codegen->GenerateReadBarrierSlow( 1873 invoke, output_loc, output_loc, base_loc, 0U, offset_loc); 1874 } 1875 } else { 1876 __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0)); 1877 __ MaybeUnpoisonHeapReference(output); 1878 } 1879 break; 1880 } 1881 1882 case DataType::Type::kInt64: 1883 __ movq(output, Address(base, offset, ScaleFactor::TIMES_1, 0)); 1884 break; 1885 1886 default: 1887 LOG(FATAL) << "Unsupported op size " << type; 1888 UNREACHABLE(); 1889 } 1890 } 1891 1892 static void CreateIntIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) { 1893 bool can_call = kEmitCompilerReadBarrier && 1894 (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject || 1895 invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile); 1896 LocationSummary* locations = 1897 new (allocator) LocationSummary(invoke, 1898 can_call 1899 ? LocationSummary::kCallOnSlowPath 1900 : LocationSummary::kNoCall, 1901 kIntrinsified); 1902 if (can_call && kUseBakerReadBarrier) { 1903 locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. 1904 } 1905 locations->SetInAt(0, Location::NoLocation()); // Unused receiver. 1906 locations->SetInAt(1, Location::RequiresRegister()); 1907 locations->SetInAt(2, Location::RequiresRegister()); 1908 locations->SetOut(Location::RequiresRegister(), 1909 (can_call ? 
Location::kOutputOverlap : Location::kNoOutputOverlap)); 1910 } 1911 1912 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGet(HInvoke* invoke) { 1913 CreateIntIntIntToIntLocations(allocator_, invoke); 1914 } 1915 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) { 1916 CreateIntIntIntToIntLocations(allocator_, invoke); 1917 } 1918 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLong(HInvoke* invoke) { 1919 CreateIntIntIntToIntLocations(allocator_, invoke); 1920 } 1921 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) { 1922 CreateIntIntIntToIntLocations(allocator_, invoke); 1923 } 1924 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObject(HInvoke* invoke) { 1925 CreateIntIntIntToIntLocations(allocator_, invoke); 1926 } 1927 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) { 1928 CreateIntIntIntToIntLocations(allocator_, invoke); 1929 } 1930 1931 1932 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGet(HInvoke* invoke) { 1933 GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile= */ false, codegen_); 1934 } 1935 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) { 1936 GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile= */ true, codegen_); 1937 } 1938 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLong(HInvoke* invoke) { 1939 GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile= */ false, codegen_); 1940 } 1941 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) { 1942 GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile= */ true, codegen_); 1943 } 1944 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObject(HInvoke* invoke) { 1945 GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile= */ false, codegen_); 1946 } 1947 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) { 1948 GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile= */ true, codegen_); 1949 } 1950 1951 1952 static void CreateIntIntIntIntToVoidPlusTempsLocations(ArenaAllocator* allocator, 1953 DataType::Type type, 1954 HInvoke* invoke) { 1955 LocationSummary* locations = 1956 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); 1957 locations->SetInAt(0, Location::NoLocation()); // Unused receiver. 1958 locations->SetInAt(1, Location::RequiresRegister()); 1959 locations->SetInAt(2, Location::RequiresRegister()); 1960 locations->SetInAt(3, Location::RequiresRegister()); 1961 if (type == DataType::Type::kReference) { 1962 // Need temp registers for card-marking. 1963 locations->AddTemp(Location::RequiresRegister()); // Possibly used for reference poisoning too. 
1964 locations->AddTemp(Location::RequiresRegister()); 1965 } 1966 } 1967 1968 void IntrinsicLocationsBuilderX86_64::VisitUnsafePut(HInvoke* invoke) { 1969 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt32, invoke); 1970 } 1971 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutOrdered(HInvoke* invoke) { 1972 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt32, invoke); 1973 } 1974 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutVolatile(HInvoke* invoke) { 1975 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt32, invoke); 1976 } 1977 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObject(HInvoke* invoke) { 1978 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kReference, invoke); 1979 } 1980 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) { 1981 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kReference, invoke); 1982 } 1983 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) { 1984 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kReference, invoke); 1985 } 1986 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLong(HInvoke* invoke) { 1987 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt64, invoke); 1988 } 1989 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) { 1990 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt64, invoke); 1991 } 1992 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) { 1993 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt64, invoke); 1994 } 1995 1996 // We don't care for ordered: it requires an AnyStore barrier, which is already given by the x86 1997 // memory model. 1998 static void GenUnsafePut(LocationSummary* locations, DataType::Type type, bool is_volatile, 1999 CodeGeneratorX86_64* codegen) { 2000 X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler()); 2001 CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>(); 2002 CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>(); 2003 CpuRegister value = locations->InAt(3).AsRegister<CpuRegister>(); 2004 2005 if (type == DataType::Type::kInt64) { 2006 __ movq(Address(base, offset, ScaleFactor::TIMES_1, 0), value); 2007 } else if (kPoisonHeapReferences && type == DataType::Type::kReference) { 2008 CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>(); 2009 __ movl(temp, value); 2010 __ PoisonHeapReference(temp); 2011 __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), temp); 2012 } else { 2013 __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), value); 2014 } 2015 2016 if (is_volatile) { 2017 codegen->MemoryFence(); 2018 } 2019 2020 if (type == DataType::Type::kReference) { 2021 bool value_can_be_null = true; // TODO: Worth finding out this information? 
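// MarkGCCard dirties the card covering `base` (roughly card_table[base >> kCardShift] = dirty) so
// the GC knows this object may now hold a reference it needs to revisit; with value_can_be_null
// set it also skips the marking entirely when the stored reference is null.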
2022 codegen->MarkGCCard(locations->GetTemp(0).AsRegister<CpuRegister>(), 2023 locations->GetTemp(1).AsRegister<CpuRegister>(), 2024 base, 2025 value, 2026 value_can_be_null); 2027 } 2028 } 2029 2030 void IntrinsicCodeGeneratorX86_64::VisitUnsafePut(HInvoke* invoke) { 2031 GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile= */ false, codegen_); 2032 } 2033 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutOrdered(HInvoke* invoke) { 2034 GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile= */ false, codegen_); 2035 } 2036 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutVolatile(HInvoke* invoke) { 2037 GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile= */ true, codegen_); 2038 } 2039 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObject(HInvoke* invoke) { 2040 GenUnsafePut( 2041 invoke->GetLocations(), DataType::Type::kReference, /* is_volatile= */ false, codegen_); 2042 } 2043 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) { 2044 GenUnsafePut( 2045 invoke->GetLocations(), DataType::Type::kReference, /* is_volatile= */ false, codegen_); 2046 } 2047 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) { 2048 GenUnsafePut( 2049 invoke->GetLocations(), DataType::Type::kReference, /* is_volatile= */ true, codegen_); 2050 } 2051 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLong(HInvoke* invoke) { 2052 GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile= */ false, codegen_); 2053 } 2054 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) { 2055 GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile= */ false, codegen_); 2056 } 2057 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) { 2058 GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile= */ true, codegen_); 2059 } 2060 2061 static void CreateIntIntIntIntIntToInt(ArenaAllocator* allocator, 2062 DataType::Type type, 2063 HInvoke* invoke) { 2064 bool can_call = kEmitCompilerReadBarrier && 2065 kUseBakerReadBarrier && 2066 (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject); 2067 LocationSummary* locations = 2068 new (allocator) LocationSummary(invoke, 2069 can_call 2070 ? LocationSummary::kCallOnSlowPath 2071 : LocationSummary::kNoCall, 2072 kIntrinsified); 2073 locations->SetInAt(0, Location::NoLocation()); // Unused receiver. 2074 locations->SetInAt(1, Location::RequiresRegister()); 2075 locations->SetInAt(2, Location::RequiresRegister()); 2076 // expected value must be in EAX/RAX. 2077 locations->SetInAt(3, Location::RegisterLocation(RAX)); 2078 locations->SetInAt(4, Location::RequiresRegister()); 2079 2080 locations->SetOut(Location::RequiresRegister()); 2081 if (type == DataType::Type::kReference) { 2082 // Need temporary registers for card-marking, and possibly for 2083 // (Baker) read barrier. 2084 locations->AddTemp(Location::RequiresRegister()); // Possibly used for reference poisoning too. 
2085 locations->AddTemp(Location::RequiresRegister()); 2086 } 2087 } 2088 2089 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASInt(HInvoke* invoke) { 2090 CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kInt32, invoke); 2091 } 2092 2093 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASLong(HInvoke* invoke) { 2094 CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kInt64, invoke); 2095 } 2096 2097 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASObject(HInvoke* invoke) { 2098 // The only read barrier implementation supporting the 2099 // UnsafeCASObject intrinsic is the Baker-style read barriers. 2100 if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { 2101 return; 2102 } 2103 2104 CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kReference, invoke); 2105 } 2106 2107 static void GenCAS(DataType::Type type, HInvoke* invoke, CodeGeneratorX86_64* codegen) { 2108 X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler()); 2109 LocationSummary* locations = invoke->GetLocations(); 2110 2111 CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>(); 2112 CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>(); 2113 CpuRegister expected = locations->InAt(3).AsRegister<CpuRegister>(); 2114 // Ensure `expected` is in RAX (required by the CMPXCHG instruction). 2115 DCHECK_EQ(expected.AsRegister(), RAX); 2116 CpuRegister value = locations->InAt(4).AsRegister<CpuRegister>(); 2117 Location out_loc = locations->Out(); 2118 CpuRegister out = out_loc.AsRegister<CpuRegister>(); 2119 2120 if (type == DataType::Type::kReference) { 2121 // The only read barrier implementation supporting the 2122 // UnsafeCASObject intrinsic is the Baker-style read barriers. 2123 DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); 2124 2125 CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>(); 2126 CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>(); 2127 2128 // Mark card for object assuming new value is stored. 2129 bool value_can_be_null = true; // TODO: Worth finding out this information? 2130 codegen->MarkGCCard(temp1, temp2, base, value, value_can_be_null); 2131 2132 // The address of the field within the holding object. 2133 Address field_addr(base, offset, ScaleFactor::TIMES_1, 0); 2134 2135 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { 2136 // Need to make sure the reference stored in the field is a to-space 2137 // one before attempting the CAS or the CAS could fail incorrectly. 2138 codegen->GenerateReferenceLoadWithBakerReadBarrier( 2139 invoke, 2140 out_loc, // Unused, used only as a "temporary" within the read barrier. 2141 base, 2142 field_addr, 2143 /* needs_null_check= */ false, 2144 /* always_update_field= */ true, 2145 &temp1, 2146 &temp2); 2147 } 2148 2149 bool base_equals_value = (base.AsRegister() == value.AsRegister()); 2150 Register value_reg = value.AsRegister(); 2151 if (kPoisonHeapReferences) { 2152 if (base_equals_value) { 2153 // If `base` and `value` are the same register location, move 2154 // `value_reg` to a temporary register. This way, poisoning 2155 // `value_reg` won't invalidate `base`. 2156 value_reg = temp1.AsRegister(); 2157 __ movl(CpuRegister(value_reg), base); 2158 } 2159 2160 // Check that the register allocator did not assign the location 2161 // of `expected` (RAX) to `value` nor to `base`, so that heap 2162 // poisoning (when enabled) works as intended below. 
2163 // - If `value` were equal to `expected`, both references would 2164 // be poisoned twice, meaning they would not be poisoned at 2165 // all, as heap poisoning uses address negation. 2166 // - If `base` were equal to `expected`, poisoning `expected` 2167 // would invalidate `base`. 2168 DCHECK_NE(value_reg, expected.AsRegister()); 2169 DCHECK_NE(base.AsRegister(), expected.AsRegister()); 2170 2171 __ PoisonHeapReference(expected); 2172 __ PoisonHeapReference(CpuRegister(value_reg)); 2173 } 2174 2175 __ LockCmpxchgl(field_addr, CpuRegister(value_reg)); 2176 2177 // LOCK CMPXCHG has full barrier semantics, and we don't need 2178 // scheduling barriers at this time. 2179 2180 // Convert ZF into the Boolean result. 2181 __ setcc(kZero, out); 2182 __ movzxb(out, out); 2183 2184 // If heap poisoning is enabled, we need to unpoison the values 2185 // that were poisoned earlier. 2186 if (kPoisonHeapReferences) { 2187 if (base_equals_value) { 2188 // `value_reg` has been moved to a temporary register, no need 2189 // to unpoison it. 2190 } else { 2191 // Ensure `value` is different from `out`, so that unpoisoning 2192 // the former does not invalidate the latter. 2193 DCHECK_NE(value_reg, out.AsRegister()); 2194 __ UnpoisonHeapReference(CpuRegister(value_reg)); 2195 } 2196 // Ensure `expected` is different from `out`, so that unpoisoning 2197 // the former does not invalidate the latter. 2198 DCHECK_NE(expected.AsRegister(), out.AsRegister()); 2199 __ UnpoisonHeapReference(expected); 2200 } 2201 } else { 2202 if (type == DataType::Type::kInt32) { 2203 __ LockCmpxchgl(Address(base, offset, TIMES_1, 0), value); 2204 } else if (type == DataType::Type::kInt64) { 2205 __ LockCmpxchgq(Address(base, offset, TIMES_1, 0), value); 2206 } else { 2207 LOG(FATAL) << "Unexpected CAS type " << type; 2208 } 2209 2210 // LOCK CMPXCHG has full barrier semantics, and we don't need 2211 // scheduling barriers at this time. 2212 2213 // Convert ZF into the Boolean result. 2214 __ setcc(kZero, out); 2215 __ movzxb(out, out); 2216 } 2217 } 2218 2219 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASInt(HInvoke* invoke) { 2220 GenCAS(DataType::Type::kInt32, invoke, codegen_); 2221 } 2222 2223 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASLong(HInvoke* invoke) { 2224 GenCAS(DataType::Type::kInt64, invoke, codegen_); 2225 } 2226 2227 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASObject(HInvoke* invoke) { 2228 // The only read barrier implementation supporting the 2229 // UnsafeCASObject intrinsic is the Baker-style read barriers. 
2230 DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); 2231 2232 GenCAS(DataType::Type::kReference, invoke, codegen_); 2233 } 2234 2235 void IntrinsicLocationsBuilderX86_64::VisitIntegerReverse(HInvoke* invoke) { 2236 LocationSummary* locations = 2237 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); 2238 locations->SetInAt(0, Location::RequiresRegister()); 2239 locations->SetOut(Location::SameAsFirstInput()); 2240 locations->AddTemp(Location::RequiresRegister()); 2241 } 2242 2243 static void SwapBits(CpuRegister reg, CpuRegister temp, int32_t shift, int32_t mask, 2244 X86_64Assembler* assembler) { 2245 Immediate imm_shift(shift); 2246 Immediate imm_mask(mask); 2247 __ movl(temp, reg); 2248 __ shrl(reg, imm_shift); 2249 __ andl(temp, imm_mask); 2250 __ andl(reg, imm_mask); 2251 __ shll(temp, imm_shift); 2252 __ orl(reg, temp); 2253 } 2254 2255 void IntrinsicCodeGeneratorX86_64::VisitIntegerReverse(HInvoke* invoke) { 2256 X86_64Assembler* assembler = GetAssembler(); 2257 LocationSummary* locations = invoke->GetLocations(); 2258 2259 CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>(); 2260 CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>(); 2261 2262 /* 2263 * Use one bswap instruction to reverse byte order first and then use 3 rounds of 2264 * swapping bits to reverse bits in a number x. Using bswap to save instructions 2265 * compared to generic luni implementation which has 5 rounds of swapping bits. 2266 * x = bswap x 2267 * x = (x & 0x55555555) << 1 | (x >> 1) & 0x55555555; 2268 * x = (x & 0x33333333) << 2 | (x >> 2) & 0x33333333; 2269 * x = (x & 0x0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F; 2270 */ 2271 __ bswapl(reg); 2272 SwapBits(reg, temp, 1, 0x55555555, assembler); 2273 SwapBits(reg, temp, 2, 0x33333333, assembler); 2274 SwapBits(reg, temp, 4, 0x0f0f0f0f, assembler); 2275 } 2276 2277 void IntrinsicLocationsBuilderX86_64::VisitLongReverse(HInvoke* invoke) { 2278 LocationSummary* locations = 2279 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); 2280 locations->SetInAt(0, Location::RequiresRegister()); 2281 locations->SetOut(Location::SameAsFirstInput()); 2282 locations->AddTemp(Location::RequiresRegister()); 2283 locations->AddTemp(Location::RequiresRegister()); 2284 } 2285 2286 static void SwapBits64(CpuRegister reg, CpuRegister temp, CpuRegister temp_mask, 2287 int32_t shift, int64_t mask, X86_64Assembler* assembler) { 2288 Immediate imm_shift(shift); 2289 __ movq(temp_mask, Immediate(mask)); 2290 __ movq(temp, reg); 2291 __ shrq(reg, imm_shift); 2292 __ andq(temp, temp_mask); 2293 __ andq(reg, temp_mask); 2294 __ shlq(temp, imm_shift); 2295 __ orq(reg, temp); 2296 } 2297 2298 void IntrinsicCodeGeneratorX86_64::VisitLongReverse(HInvoke* invoke) { 2299 X86_64Assembler* assembler = GetAssembler(); 2300 LocationSummary* locations = invoke->GetLocations(); 2301 2302 CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>(); 2303 CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>(); 2304 CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>(); 2305 2306 /* 2307 * Use one bswap instruction to reverse byte order first and then use 3 rounds of 2308 * swapping bits to reverse bits in a long number x. Using bswap to save instructions 2309 * compared to generic luni implementation which has 5 rounds of swapping bits. 
2310 * x = bswap x 2311 * x = (x & 0x5555555555555555) << 1 | (x >> 1) & 0x5555555555555555; 2312 * x = (x & 0x3333333333333333) << 2 | (x >> 2) & 0x3333333333333333; 2313 * x = (x & 0x0F0F0F0F0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F0F0F0F0F; 2314 */ 2315 __ bswapq(reg); 2316 SwapBits64(reg, temp1, temp2, 1, INT64_C(0x5555555555555555), assembler); 2317 SwapBits64(reg, temp1, temp2, 2, INT64_C(0x3333333333333333), assembler); 2318 SwapBits64(reg, temp1, temp2, 4, INT64_C(0x0f0f0f0f0f0f0f0f), assembler); 2319 } 2320 2321 static void CreateBitCountLocations( 2322 ArenaAllocator* allocator, CodeGeneratorX86_64* codegen, HInvoke* invoke) { 2323 if (!codegen->GetInstructionSetFeatures().HasPopCnt()) { 2324 // Do nothing if there is no popcnt support. This results in generating 2325 // a call for the intrinsic rather than direct code. 2326 return; 2327 } 2328 LocationSummary* locations = 2329 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); 2330 locations->SetInAt(0, Location::Any()); 2331 locations->SetOut(Location::RequiresRegister()); 2332 } 2333 2334 static void GenBitCount(X86_64Assembler* assembler, 2335 CodeGeneratorX86_64* codegen, 2336 HInvoke* invoke, 2337 bool is_long) { 2338 LocationSummary* locations = invoke->GetLocations(); 2339 Location src = locations->InAt(0); 2340 CpuRegister out = locations->Out().AsRegister<CpuRegister>(); 2341 2342 if (invoke->InputAt(0)->IsConstant()) { 2343 // Evaluate this at compile time. 2344 int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant()); 2345 int32_t result = is_long 2346 ? POPCOUNT(static_cast<uint64_t>(value)) 2347 : POPCOUNT(static_cast<uint32_t>(value)); 2348 codegen->Load32BitValue(out, result); 2349 return; 2350 } 2351 2352 if (src.IsRegister()) { 2353 if (is_long) { 2354 __ popcntq(out, src.AsRegister<CpuRegister>()); 2355 } else { 2356 __ popcntl(out, src.AsRegister<CpuRegister>()); 2357 } 2358 } else if (is_long) { 2359 DCHECK(src.IsDoubleStackSlot()); 2360 __ popcntq(out, Address(CpuRegister(RSP), src.GetStackIndex())); 2361 } else { 2362 DCHECK(src.IsStackSlot()); 2363 __ popcntl(out, Address(CpuRegister(RSP), src.GetStackIndex())); 2364 } 2365 } 2366 2367 void IntrinsicLocationsBuilderX86_64::VisitIntegerBitCount(HInvoke* invoke) { 2368 CreateBitCountLocations(allocator_, codegen_, invoke); 2369 } 2370 2371 void IntrinsicCodeGeneratorX86_64::VisitIntegerBitCount(HInvoke* invoke) { 2372 GenBitCount(GetAssembler(), codegen_, invoke, /* is_long= */ false); 2373 } 2374 2375 void IntrinsicLocationsBuilderX86_64::VisitLongBitCount(HInvoke* invoke) { 2376 CreateBitCountLocations(allocator_, codegen_, invoke); 2377 } 2378 2379 void IntrinsicCodeGeneratorX86_64::VisitLongBitCount(HInvoke* invoke) { 2380 GenBitCount(GetAssembler(), codegen_, invoke, /* is_long= */ true); 2381 } 2382 2383 static void CreateOneBitLocations(ArenaAllocator* allocator, HInvoke* invoke, bool is_high) { 2384 LocationSummary* locations = 2385 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); 2386 locations->SetInAt(0, Location::Any()); 2387 locations->SetOut(Location::RequiresRegister()); 2388 locations->AddTemp(is_high ? 
Location::RegisterLocation(RCX) // needs CL 2389 : Location::RequiresRegister()); // any will do 2390 } 2391 2392 static void GenOneBit(X86_64Assembler* assembler, 2393 CodeGeneratorX86_64* codegen, 2394 HInvoke* invoke, 2395 bool is_high, bool is_long) { 2396 LocationSummary* locations = invoke->GetLocations(); 2397 Location src = locations->InAt(0); 2398 CpuRegister out = locations->Out().AsRegister<CpuRegister>(); 2399 2400 if (invoke->InputAt(0)->IsConstant()) { 2401 // Evaluate this at compile time. 2402 int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant()); 2403 if (value == 0) { 2404 __ xorl(out, out); // Clears upper bits too. 2405 return; 2406 } 2407 // Nonzero value. 2408 if (is_high) { 2409 value = is_long ? 63 - CLZ(static_cast<uint64_t>(value)) 2410 : 31 - CLZ(static_cast<uint32_t>(value)); 2411 } else { 2412 value = is_long ? CTZ(static_cast<uint64_t>(value)) 2413 : CTZ(static_cast<uint32_t>(value)); 2414 } 2415 if (is_long) { 2416 codegen->Load64BitValue(out, 1ULL << value); 2417 } else { 2418 codegen->Load32BitValue(out, 1 << value); 2419 } 2420 return; 2421 } 2422 2423 // Handle the non-constant cases. 2424 if (!is_high && codegen->GetInstructionSetFeatures().HasAVX2() && 2425 src.IsRegister()) { 2426 __ blsi(out, src.AsRegister<CpuRegister>()); 2427 } else { 2428 CpuRegister tmp = locations->GetTemp(0).AsRegister<CpuRegister>(); 2429 if (is_high) { 2430 // Use architectural support: basically 1 << bsr. 2431 if (src.IsRegister()) { 2432 if (is_long) { 2433 __ bsrq(tmp, src.AsRegister<CpuRegister>()); 2434 } else { 2435 __ bsrl(tmp, src.AsRegister<CpuRegister>()); 2436 } 2437 } else if (is_long) { 2438 DCHECK(src.IsDoubleStackSlot()); 2439 __ bsrq(tmp, Address(CpuRegister(RSP), src.GetStackIndex())); 2440 } else { 2441 DCHECK(src.IsStackSlot()); 2442 __ bsrl(tmp, Address(CpuRegister(RSP), src.GetStackIndex())); 2443 } 2444 // BSR sets ZF if the input was zero. 2445 NearLabel is_zero, done; 2446 __ j(kEqual, &is_zero); 2447 __ movl(out, Immediate(1)); // Clears upper bits too. 2448 if (is_long) { 2449 __ shlq(out, tmp); 2450 } else { 2451 __ shll(out, tmp); 2452 } 2453 __ jmp(&done); 2454 __ Bind(&is_zero); 2455 __ xorl(out, out); // Clears upper bits too. 2456 __ Bind(&done); 2457 } else { 2458 // Copy input into temporary. 
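// Then isolate the lowest set bit as out = tmp & -tmp, the same value the BLSI path above
// produces directly.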
2459 if (src.IsRegister()) { 2460 if (is_long) { 2461 __ movq(tmp, src.AsRegister<CpuRegister>()); 2462 } else { 2463 __ movl(tmp, src.AsRegister<CpuRegister>()); 2464 } 2465 } else if (is_long) { 2466 DCHECK(src.IsDoubleStackSlot()); 2467 __ movq(tmp, Address(CpuRegister(RSP), src.GetStackIndex())); 2468 } else { 2469 DCHECK(src.IsStackSlot()); 2470 __ movl(tmp, Address(CpuRegister(RSP), src.GetStackIndex())); 2471 } 2472 // Do the bit twiddling: basically tmp & -tmp; 2473 if (is_long) { 2474 __ movq(out, tmp); 2475 __ negq(tmp); 2476 __ andq(out, tmp); 2477 } else { 2478 __ movl(out, tmp); 2479 __ negl(tmp); 2480 __ andl(out, tmp); 2481 } 2482 } 2483 } 2484 } 2485 2486 void IntrinsicLocationsBuilderX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) { 2487 CreateOneBitLocations(allocator_, invoke, /* is_high= */ true); 2488 } 2489 2490 void IntrinsicCodeGeneratorX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) { 2491 GenOneBit(GetAssembler(), codegen_, invoke, /* is_high= */ true, /* is_long= */ false); 2492 } 2493 2494 void IntrinsicLocationsBuilderX86_64::VisitLongHighestOneBit(HInvoke* invoke) { 2495 CreateOneBitLocations(allocator_, invoke, /* is_high= */ true); 2496 } 2497 2498 void IntrinsicCodeGeneratorX86_64::VisitLongHighestOneBit(HInvoke* invoke) { 2499 GenOneBit(GetAssembler(), codegen_, invoke, /* is_high= */ true, /* is_long= */ true); 2500 } 2501 2502 void IntrinsicLocationsBuilderX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) { 2503 CreateOneBitLocations(allocator_, invoke, /* is_high= */ false); 2504 } 2505 2506 void IntrinsicCodeGeneratorX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) { 2507 GenOneBit(GetAssembler(), codegen_, invoke, /* is_high= */ false, /* is_long= */ false); 2508 } 2509 2510 void IntrinsicLocationsBuilderX86_64::VisitLongLowestOneBit(HInvoke* invoke) { 2511 CreateOneBitLocations(allocator_, invoke, /* is_high= */ false); 2512 } 2513 2514 void IntrinsicCodeGeneratorX86_64::VisitLongLowestOneBit(HInvoke* invoke) { 2515 GenOneBit(GetAssembler(), codegen_, invoke, /* is_high= */ false, /* is_long= */ true); 2516 } 2517 2518 static void CreateLeadingZeroLocations(ArenaAllocator* allocator, HInvoke* invoke) { 2519 LocationSummary* locations = 2520 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); 2521 locations->SetInAt(0, Location::Any()); 2522 locations->SetOut(Location::RequiresRegister()); 2523 } 2524 2525 static void GenLeadingZeros(X86_64Assembler* assembler, 2526 CodeGeneratorX86_64* codegen, 2527 HInvoke* invoke, bool is_long) { 2528 LocationSummary* locations = invoke->GetLocations(); 2529 Location src = locations->InAt(0); 2530 CpuRegister out = locations->Out().AsRegister<CpuRegister>(); 2531 2532 int zero_value_result = is_long ? 64 : 32; 2533 if (invoke->InputAt(0)->IsConstant()) { 2534 // Evaluate this at compile time. 2535 int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant()); 2536 if (value == 0) { 2537 value = zero_value_result; 2538 } else { 2539 value = is_long ? CLZ(static_cast<uint64_t>(value)) : CLZ(static_cast<uint32_t>(value)); 2540 } 2541 codegen->Load32BitValue(out, value); 2542 return; 2543 } 2544 2545 // Handle the non-constant cases. 
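// Sketch: for a nonzero input BSR yields the index of the highest set bit, and
// clz = (bits - 1) - bsr = bsr ^ (bits - 1); the XOR below applies that correction, while the
// zero case (for which BSR leaves the output undefined) is patched separately.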
2546 if (src.IsRegister()) { 2547 if (is_long) { 2548 __ bsrq(out, src.AsRegister<CpuRegister>()); 2549 } else { 2550 __ bsrl(out, src.AsRegister<CpuRegister>()); 2551 } 2552 } else if (is_long) { 2553 DCHECK(src.IsDoubleStackSlot()); 2554 __ bsrq(out, Address(CpuRegister(RSP), src.GetStackIndex())); 2555 } else { 2556 DCHECK(src.IsStackSlot()); 2557 __ bsrl(out, Address(CpuRegister(RSP), src.GetStackIndex())); 2558 } 2559 2560 // BSR sets ZF if the input was zero, and the output is undefined. 2561 NearLabel is_zero, done; 2562 __ j(kEqual, &is_zero); 2563 2564 // Correct the result from BSR to get the CLZ result. 2565 __ xorl(out, Immediate(zero_value_result - 1)); 2566 __ jmp(&done); 2567 2568 // Fix the zero case with the expected result. 2569 __ Bind(&is_zero); 2570 __ movl(out, Immediate(zero_value_result)); 2571 2572 __ Bind(&done); 2573 } 2574 2575 void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) { 2576 CreateLeadingZeroLocations(allocator_, invoke); 2577 } 2578 2579 void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) { 2580 GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long= */ false); 2581 } 2582 2583 void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) { 2584 CreateLeadingZeroLocations(allocator_, invoke); 2585 } 2586 2587 void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) { 2588 GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long= */ true); 2589 } 2590 2591 static void CreateTrailingZeroLocations(ArenaAllocator* allocator, HInvoke* invoke) { 2592 LocationSummary* locations = 2593 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); 2594 locations->SetInAt(0, Location::Any()); 2595 locations->SetOut(Location::RequiresRegister()); 2596 } 2597 2598 static void GenTrailingZeros(X86_64Assembler* assembler, 2599 CodeGeneratorX86_64* codegen, 2600 HInvoke* invoke, bool is_long) { 2601 LocationSummary* locations = invoke->GetLocations(); 2602 Location src = locations->InAt(0); 2603 CpuRegister out = locations->Out().AsRegister<CpuRegister>(); 2604 2605 int zero_value_result = is_long ? 64 : 32; 2606 if (invoke->InputAt(0)->IsConstant()) { 2607 // Evaluate this at compile time. 2608 int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant()); 2609 if (value == 0) { 2610 value = zero_value_result; 2611 } else { 2612 value = is_long ? CTZ(static_cast<uint64_t>(value)) : CTZ(static_cast<uint32_t>(value)); 2613 } 2614 codegen->Load32BitValue(out, value); 2615 return; 2616 } 2617 2618 // Handle the non-constant cases. 2619 if (src.IsRegister()) { 2620 if (is_long) { 2621 __ bsfq(out, src.AsRegister<CpuRegister>()); 2622 } else { 2623 __ bsfl(out, src.AsRegister<CpuRegister>()); 2624 } 2625 } else if (is_long) { 2626 DCHECK(src.IsDoubleStackSlot()); 2627 __ bsfq(out, Address(CpuRegister(RSP), src.GetStackIndex())); 2628 } else { 2629 DCHECK(src.IsStackSlot()); 2630 __ bsfl(out, Address(CpuRegister(RSP), src.GetStackIndex())); 2631 } 2632 2633 // BSF sets ZF if the input was zero, and the output is undefined. 2634 NearLabel done; 2635 __ j(kNotEqual, &done); 2636 2637 // Fix the zero case with the expected result. 
2638 __ movl(out, Immediate(zero_value_result)); 2639 2640 __ Bind(&done); 2641 } 2642 2643 void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) { 2644 CreateTrailingZeroLocations(allocator_, invoke); 2645 } 2646 2647 void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) { 2648 GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long= */ false); 2649 } 2650 2651 void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) { 2652 CreateTrailingZeroLocations(allocator_, invoke); 2653 } 2654 2655 void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) { 2656 GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long= */ true); 2657 } 2658 2659 void IntrinsicLocationsBuilderX86_64::VisitIntegerValueOf(HInvoke* invoke) { 2660 InvokeRuntimeCallingConvention calling_convention; 2661 IntrinsicVisitor::ComputeIntegerValueOfLocations( 2662 invoke, 2663 codegen_, 2664 Location::RegisterLocation(RAX), 2665 Location::RegisterLocation(calling_convention.GetRegisterAt(0))); 2666 } 2667 2668 void IntrinsicCodeGeneratorX86_64::VisitIntegerValueOf(HInvoke* invoke) { 2669 IntrinsicVisitor::IntegerValueOfInfo info = 2670 IntrinsicVisitor::ComputeIntegerValueOfInfo(invoke, codegen_->GetCompilerOptions()); 2671 LocationSummary* locations = invoke->GetLocations(); 2672 X86_64Assembler* assembler = GetAssembler(); 2673 2674 CpuRegister out = locations->Out().AsRegister<CpuRegister>(); 2675 InvokeRuntimeCallingConvention calling_convention; 2676 CpuRegister argument = CpuRegister(calling_convention.GetRegisterAt(0)); 2677 if (invoke->InputAt(0)->IsIntConstant()) { 2678 int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue(); 2679 if (static_cast<uint32_t>(value - info.low) < info.length) { 2680 // Just embed the j.l.Integer in the code. 2681 DCHECK_NE(info.value_boot_image_reference, IntegerValueOfInfo::kInvalidReference); 2682 codegen_->LoadBootImageAddress(out, info.value_boot_image_reference); 2683 } else { 2684 DCHECK(locations->CanCall()); 2685 // Allocate and initialize a new j.l.Integer. 2686 // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the 2687 // JIT object table. 2688 codegen_->AllocateInstanceForIntrinsic(invoke->AsInvokeStaticOrDirect(), 2689 info.integer_boot_image_offset); 2690 __ movl(Address(out, info.value_offset), Immediate(value)); 2691 } 2692 } else { 2693 DCHECK(locations->CanCall()); 2694 CpuRegister in = locations->InAt(0).AsRegister<CpuRegister>(); 2695 // Check bounds of our cache. 2696 __ leal(out, Address(in, -info.low)); 2697 __ cmpl(out, Immediate(info.length)); 2698 NearLabel allocate, done; 2699 __ j(kAboveEqual, &allocate); 2700 // If the value is within the bounds, load the j.l.Integer directly from the array. 2701 DCHECK_NE(out.AsRegister(), argument.AsRegister()); 2702 codegen_->LoadBootImageAddress(argument, info.array_data_boot_image_reference); 2703 static_assert((1u << TIMES_4) == sizeof(mirror::HeapReference<mirror::Object>), 2704 "Check heap reference size."); 2705 __ movl(out, Address(argument, out, TIMES_4, 0)); 2706 __ MaybeUnpoisonHeapReference(out); 2707 __ jmp(&done); 2708 __ Bind(&allocate); 2709 // Otherwise allocate and initialize a new j.l.Integer. 
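// (Overall this mirrors Integer.valueOf: if static_cast<uint32_t>(value - low) < length, return
// the boot-image cache entry at index value - low; otherwise box a fresh java.lang.Integer.)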
2710 codegen_->AllocateInstanceForIntrinsic(invoke->AsInvokeStaticOrDirect(), 2711 info.integer_boot_image_offset); 2712 __ movl(Address(out, info.value_offset), in); 2713 __ Bind(&done); 2714 } 2715 } 2716 2717 void IntrinsicLocationsBuilderX86_64::VisitThreadInterrupted(HInvoke* invoke) { 2718 LocationSummary* locations = 2719 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); 2720 locations->SetOut(Location::RequiresRegister()); 2721 } 2722 2723 void IntrinsicCodeGeneratorX86_64::VisitThreadInterrupted(HInvoke* invoke) { 2724 X86_64Assembler* assembler = GetAssembler(); 2725 CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>(); 2726 Address address = Address::Absolute 2727 (Thread::InterruptedOffset<kX86_64PointerSize>().Int32Value(), /* no_rip= */ true); 2728 NearLabel done; 2729 __ gs()->movl(out, address); 2730 __ testl(out, out); 2731 __ j(kEqual, &done); 2732 __ gs()->movl(address, Immediate(0)); 2733 codegen_->MemoryFence(); 2734 __ Bind(&done); 2735 } 2736 2737 void IntrinsicLocationsBuilderX86_64::VisitReachabilityFence(HInvoke* invoke) { 2738 LocationSummary* locations = 2739 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); 2740 locations->SetInAt(0, Location::Any()); 2741 } 2742 2743 void IntrinsicCodeGeneratorX86_64::VisitReachabilityFence(HInvoke* invoke ATTRIBUTE_UNUSED) { } 2744 2745 UNIMPLEMENTED_INTRINSIC(X86_64, ReferenceGetReferent) 2746 UNIMPLEMENTED_INTRINSIC(X86_64, FloatIsInfinite) 2747 UNIMPLEMENTED_INTRINSIC(X86_64, DoubleIsInfinite) 2748 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32Update) 2749 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateBytes) 2750 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateByteBuffer) 2751 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToFloat) 2752 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToHalf) 2753 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Floor) 2754 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Ceil) 2755 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Rint) 2756 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Greater) 2757 UNIMPLEMENTED_INTRINSIC(X86_64, FP16GreaterEquals) 2758 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Less) 2759 UNIMPLEMENTED_INTRINSIC(X86_64, FP16LessEquals) 2760 2761 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf); 2762 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter); 2763 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferAppend); 2764 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferLength); 2765 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferToString); 2766 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendObject); 2767 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendString); 2768 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendCharSequence); 2769 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendCharArray); 2770 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendBoolean); 2771 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendChar); 2772 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendInt); 2773 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendLong); 2774 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendFloat); 2775 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppendDouble); 2776 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderLength); 2777 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderToString); 2778 2779 // 1.8. 
2780 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddInt) 2781 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddLong) 2782 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetInt) 2783 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetLong) 2784 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetObject) 2785 2786 UNREACHABLE_INTRINSICS(X86_64) 2787 2788 #undef __ 2789 2790 } // namespace x86_64 2791 } // namespace art 2792