4 files changed, 112 insertions, 39 deletions
diff --git a/deps/v8/src/compiler/backend/arm/code-generator-arm.cc b/deps/v8/src/compiler/backend/arm/code-generator-arm.cc
index 65a569d755..3fe5361083 100644
--- a/deps/v8/src/compiler/backend/arm/code-generator-arm.cc
+++ b/deps/v8/src/compiler/backend/arm/code-generator-arm.cc
@@ -44,7 +44,7 @@ class ArmOperandConverter final : public InstructionOperandConverter {
     UNREACHABLE();
   }
 
-  Operand InputImmediate(size_t index) {
+  Operand InputImmediate(size_t index) const {
     return ToImmediate(instr_->InputAt(index));
   }
 
@@ -111,7 +111,7 @@ class ArmOperandConverter final : public InstructionOperandConverter {
     return InputOffset(&first_index);
   }
 
-  Operand ToImmediate(InstructionOperand* operand) {
+  Operand ToImmediate(InstructionOperand* operand) const {
     Constant constant = ToConstant(operand);
     switch (constant.type()) {
       case Constant::kInt32:
@@ -153,9 +153,6 @@ class ArmOperandConverter final : public InstructionOperandConverter {
   NeonMemOperand NeonInputOperand(size_t first_index) {
     const size_t index = first_index;
     switch (AddressingModeField::decode(instr_->opcode())) {
-      case kMode_Offset_RR:
-        return NeonMemOperand(InputRegister(index + 0),
-                              InputRegister(index + 1));
       case kMode_Operand2_R:
         return NeonMemOperand(InputRegister(index + 0));
       default:
@@ -309,9 +306,9 @@ Condition FlagsConditionToCondition(FlagsCondition condition) {
   UNREACHABLE();
 }
 
-void EmitWordLoadPoisoningIfNeeded(
-    CodeGenerator* codegen, InstructionCode opcode,
-    ArmOperandConverter& i) {  // NOLINT(runtime/references)
+void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
+                                   InstructionCode opcode,
+                                   ArmOperandConverter const& i) {
   const MemoryAccessMode access_mode =
       static_cast<MemoryAccessMode>(MiscField::decode(opcode));
   if (access_mode == kMemoryAccessPoisoned) {
@@ -320,10 +317,10 @@ void EmitWordLoadPoisoningIfNeeded(
   }
 }
 
-void ComputePoisonedAddressForLoad(
-    CodeGenerator* codegen, InstructionCode opcode,
-    ArmOperandConverter& i,  // NOLINT(runtime/references)
-    Register address) {
+void ComputePoisonedAddressForLoad(CodeGenerator* codegen,
+                                   InstructionCode opcode,
+                                   ArmOperandConverter const& i,
+                                   Register address) {
   DCHECK_EQ(kMemoryAccessPoisoned,
             static_cast<MemoryAccessMode>(MiscField::decode(opcode)));
   switch (AddressingModeField::decode(opcode)) {
@@ -1798,6 +1795,19 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ vneg(i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;
     }
+    case kArmF32x4Sqrt: {
+      QwNeonRegister dst = i.OutputSimd128Register();
+      QwNeonRegister src1 = i.InputSimd128Register(0);
+      DCHECK_EQ(dst, q0);
+      DCHECK_EQ(src1, q0);
+      // Take the square root lane by lane with scalar vsqrt, addressing lane
+      // n of Q-register q as S-register 4 * q + n.
+      for (int lane = 0; lane < 4; ++lane) {
+        __ vsqrt(SwVfpRegister::from_code(dst.code() * 4 + lane),
+                 SwVfpRegister::from_code(src1.code() * 4 + lane));
+      }
+      break;
+    }
     case kArmF32x4RecipApprox: {
       __ vrecpe(i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;
@@ -1919,14 +1929,20 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kArmI32x4Shl: {
       QwNeonRegister tmp = i.TempSimd128Register(0);
-      __ vdup(Neon32, tmp, i.InputRegister(1));
+      Register shift = i.TempRegister(1);
+      // Take shift value modulo 32.
+      __ and_(shift, i.InputRegister(1), Operand(31));
+      __ vdup(Neon32, tmp, shift);
       __ vshl(NeonS32, i.OutputSimd128Register(), i.InputSimd128Register(0),
               tmp);
       break;
     }
     case kArmI32x4ShrS: {
       QwNeonRegister tmp = i.TempSimd128Register(0);
-      __ vdup(Neon32, tmp, i.InputRegister(1));
+      Register shift = i.TempRegister(1);
+      // Take shift value modulo 32.
+      __ and_(shift, i.InputRegister(1), Operand(31));
+      __ vdup(Neon32, tmp, shift);
       __ vneg(Neon32, tmp, tmp);
       __ vshl(NeonS32, i.OutputSimd128Register(), i.InputSimd128Register(0),
               tmp);
@@ -1998,7 +2014,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kArmI32x4ShrU: {
       QwNeonRegister tmp = i.TempSimd128Register(0);
-      __ vdup(Neon32, tmp, i.InputRegister(1));
+      Register shift = i.TempRegister(1);
+      // Take shift value modulo 32.
+      __ and_(shift, i.InputRegister(1), Operand(31));
+      __ vdup(Neon32, tmp, shift);
       __ vneg(Neon32, tmp, tmp);
       __ vshl(NeonU32, i.OutputSimd128Register(), i.InputSimd128Register(0),
               tmp);
@@ -2029,7 +2048,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kArmI16x8ExtractLane: {
-      __ ExtractLane(i.OutputRegister(), i.InputSimd128Register(0), NeonS16,
+      __ ExtractLane(i.OutputRegister(), i.InputSimd128Register(0), NeonU16,
                      i.InputInt8(1));
       break;
     }
@@ -2054,14 +2073,20 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kArmI16x8Shl: {
       QwNeonRegister tmp = i.TempSimd128Register(0);
-      __ vdup(Neon16, tmp, i.InputRegister(1));
+      Register shift = i.TempRegister(1);
+      // Take shift value modulo 16.
+      __ and_(shift, i.InputRegister(1), Operand(15));
+      __ vdup(Neon16, tmp, shift);
       __ vshl(NeonS16, i.OutputSimd128Register(), i.InputSimd128Register(0),
               tmp);
       break;
     }
     case kArmI16x8ShrS: {
       QwNeonRegister tmp = i.TempSimd128Register(0);
-      __ vdup(Neon16, tmp, i.InputRegister(1));
+      Register shift = i.TempRegister(1);
+      // Take shift value modulo 16.
+      __ and_(shift, i.InputRegister(1), Operand(15));
+      __ vdup(Neon16, tmp, shift);
       __ vneg(Neon16, tmp, tmp);
       __ vshl(NeonS16, i.OutputSimd128Register(), i.InputSimd128Register(0),
               tmp);
@@ -2142,7 +2167,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kArmI16x8ShrU: {
       QwNeonRegister tmp = i.TempSimd128Register(0);
-      __ vdup(Neon16, tmp, i.InputRegister(1));
+      Register shift = i.TempRegister(1);
+      // Take shift value modulo 16.
+      __ and_(shift, i.InputRegister(1), Operand(15));
+      __ vdup(Neon16, tmp, shift);
       __ vneg(Neon16, tmp, tmp);
       __ vshl(NeonU16, i.OutputSimd128Register(), i.InputSimd128Register(0),
               tmp);
@@ -2186,7 +2214,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kArmI8x16ExtractLane: {
-      __ ExtractLane(i.OutputRegister(), i.InputSimd128Register(0), NeonS8,
+      __ ExtractLane(i.OutputRegister(), i.InputSimd128Register(0), NeonU8,
                      i.InputInt8(1));
       break;
     }
@@ -2201,6 +2229,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kArmI8x16Shl: {
       QwNeonRegister tmp = i.TempSimd128Register(0);
-      __ vdup(Neon8, tmp, i.InputRegister(1));
+      Register shift = i.TempRegister(1);
+      // Take shift value modulo 8; the masked amount is what gets broadcast.
+      __ and_(shift, i.InputRegister(1), Operand(7));
+      __ vdup(Neon8, tmp, shift);
       __ vshl(NeonS8, i.OutputSimd128Register(), i.InputSimd128Register(0),
               tmp);
@@ -2208,7 +2239,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kArmI8x16ShrS: {
       QwNeonRegister tmp = i.TempSimd128Register(0);
-      __ vdup(Neon8, tmp, i.InputRegister(1));
+      Register shift = i.TempRegister(1);
+      // Take shift value modulo 8.
+      __ and_(shift, i.InputRegister(1), Operand(7));
+      __ vdup(Neon8, tmp, shift);
       __ vneg(Neon8, tmp, tmp);
       __ vshl(NeonS8, i.OutputSimd128Register(), i.InputSimd128Register(0),
               tmp);
@@ -2275,7 +2309,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kArmI8x16ShrU: {
       QwNeonRegister tmp = i.TempSimd128Register(0);
-      __ vdup(Neon8, tmp, i.InputRegister(1));
+      Register shift = i.TempRegister(1);
+      // Take shift value modulo 8.
+      __ and_(shift, i.InputRegister(1), Operand(7));
+      __ vdup(Neon8, tmp, shift);
       __ vneg(Neon8, tmp, tmp);
       __ vshl(NeonU8, i.OutputSimd128Register(), i.InputSimd128Register(0),
               tmp);
diff --git a/deps/v8/src/compiler/backend/arm/instruction-codes-arm.h b/deps/v8/src/compiler/backend/arm/instruction-codes-arm.h
index 3551e26aea..d398ec0ed6 100644
--- a/deps/v8/src/compiler/backend/arm/instruction-codes-arm.h
+++ b/deps/v8/src/compiler/backend/arm/instruction-codes-arm.h
@@ -135,6 +135,7 @@ namespace compiler {
   V(ArmF32x4UConvertI32x4)         \
   V(ArmF32x4Abs)                   \
   V(ArmF32x4Neg)                   \
+  V(ArmF32x4Sqrt)                  \
   V(ArmF32x4RecipApprox)           \
   V(ArmF32x4RecipSqrtApprox)       \
   V(ArmF32x4Add)                   \
diff --git a/deps/v8/src/compiler/backend/arm/instruction-scheduler-arm.cc b/deps/v8/src/compiler/backend/arm/instruction-scheduler-arm.cc
index 1d7cf61dfe..92be55dcc3 100644
--- a/deps/v8/src/compiler/backend/arm/instruction-scheduler-arm.cc
+++ b/deps/v8/src/compiler/backend/arm/instruction-scheduler-arm.cc
@@ -115,6 +115,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kArmF32x4UConvertI32x4:
     case kArmF32x4Abs:
     case kArmF32x4Neg:
+    case kArmF32x4Sqrt:
     case kArmF32x4RecipApprox:
     case kArmF32x4RecipSqrtApprox:
     case kArmF32x4Add:
diff --git a/deps/v8/src/compiler/backend/arm/instruction-selector-arm.cc b/deps/v8/src/compiler/backend/arm/instruction-selector-arm.cc
index ce74faa4a6..303648051f 100644
--- a/deps/v8/src/compiler/backend/arm/instruction-selector-arm.cc
+++ b/deps/v8/src/compiler/backend/arm/instruction-selector-arm.cc
@@ -2,9 +2,9 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-#include "src/base/adapters.h"
 #include "src/base/bits.h"
 #include "src/base/enum-set.h"
+#include "src/base/iterator.h"
 #include "src/compiler/backend/instruction-selector-impl.h"
 #include "src/compiler/node-matchers.h"
 #include "src/compiler/node-properties.h"
@@ -94,7 +94,7 @@ void VisitRRR(InstructionSelector* selector, ArchOpcode opcode, Node* node) {
 void VisitSimdShiftRRR(InstructionSelector* selector, ArchOpcode opcode,
                        Node* node) {
   ArmOperandGenerator g(selector);
-  InstructionOperand temps[] = {g.TempSimd128Register()};
+  InstructionOperand temps[] = {g.TempSimd128Register(), g.TempRegister()};
   selector->Emit(opcode, g.DefineAsRegister(node),
                  g.UseRegister(node->InputAt(0)),
                  g.UseRegister(node->InputAt(1)), arraysize(temps), temps);
@@ -352,6 +352,26 @@ void VisitMod(InstructionSelector* selector, Node* node, ArchOpcode div_opcode,
   }
 }
 
+// Materializes (base + offset) into a temporary register and switches
+// opcode_return to the single-register addressing mode using that temporary.
+// Certain instructions, e.g. vld1 and vst1, when given two registers, perform
+// the access at base and then post-increment base by offset, whereas what we
+// intend is to access memory at (base + offset).
+void EmitAddBeforeS128LoadStore(InstructionSelector* selector,
+                                InstructionCode* opcode_return,
+                                size_t* input_count_return,
+                                InstructionOperand* inputs) {
+  DCHECK(*opcode_return == kArmVld1S128 || *opcode_return == kArmVst1S128);
+  ArmOperandGenerator g(selector);
+  const InstructionCode single_reg_mode =
+      AddressingModeField::encode(kMode_Operand2_R);
+  InstructionOperand sum = g.TempRegister();
+  selector->Emit(kArmAdd | single_reg_mode, 1, &sum, 2, inputs);
+  inputs[0] = sum;
+  --*input_count_return;
+  *opcode_return |= single_reg_mode;
+}
+
 void EmitLoad(InstructionSelector* selector, InstructionCode opcode,
               InstructionOperand* output, Node* base, Node* index) {
   ArmOperandGenerator g(selector);
@@ -368,7 +388,11 @@ void EmitLoad(InstructionSelector* selector, InstructionCode opcode,
     input_count = 3;
   } else {
     inputs[1] = g.UseRegister(index);
-    opcode |= AddressingModeField::encode(kMode_Offset_RR);
+    if (opcode == kArmVld1S128) {
+      EmitAddBeforeS128LoadStore(selector, &opcode, &input_count, &inputs[0]);
+    } else {
+      opcode |= AddressingModeField::encode(kMode_Offset_RR);
+    }
   }
   selector->Emit(opcode, 1, output, input_count, inputs);
 }
@@ -386,7 +410,12 @@ void EmitStore(InstructionSelector* selector, InstructionCode opcode,
     input_count = 4;
   } else {
     inputs[input_count++] = g.UseRegister(index);
-    opcode |= AddressingModeField::encode(kMode_Offset_RR);
+    if (opcode == kArmVst1S128) {
+      // Inputs are value, base, index, only care about base and index.
+      EmitAddBeforeS128LoadStore(selector, &opcode, &input_count, &inputs[1]);
+    } else {
+      opcode |= AddressingModeField::encode(kMode_Offset_RR);
+    }
   }
   selector->Emit(opcode, 0, nullptr, input_count, inputs);
 }
@@ -596,8 +625,7 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
       Emit(kArmVmovF32U32, g.DefineAsRegister(node), temp);
       return;
     }
-    case MachineRepresentation::kFloat64:
-    case MachineRepresentation::kSimd128: {
+    case MachineRepresentation::kFloat64: {
       // Compute the address of the least-significant byte of the FP value.
       // We assume that the base node is unlikely to be an encodable immediate
       // or the result of a shift operation, so only consider the addressing
@@ -623,13 +651,10 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
 
       if (CpuFeatures::IsSupported(NEON)) {
         // With NEON we can load directly from the calculated address.
-        InstructionCode op = load_rep == MachineRepresentation::kFloat64
-                                 ? kArmVld1F64
-                                 : kArmVld1S128;
+        InstructionCode op = kArmVld1F64;
         op |= AddressingModeField::encode(kMode_Operand2_R);
         Emit(op, g.DefineAsRegister(node), addr);
       } else {
-        DCHECK_NE(MachineRepresentation::kSimd128, load_rep);
         // Load both halves and move to an FP register.
         InstructionOperand fp_lo = g.TempRegister();
         InstructionOperand fp_hi = g.TempRegister();
@@ -670,8 +695,7 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
       EmitStore(this, kArmStr, input_count, inputs, index);
       return;
     }
-    case MachineRepresentation::kFloat64:
-    case MachineRepresentation::kSimd128: {
+    case MachineRepresentation::kFloat64: {
       if (CpuFeatures::IsSupported(NEON)) {
         InstructionOperand address = g.TempRegister();
         {
@@ -697,13 +721,10 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
 
         inputs[input_count++] = g.UseRegister(value);
         inputs[input_count++] = address;
-        InstructionCode op = store_rep == MachineRepresentation::kFloat64
-                                 ? kArmVst1F64
-                                 : kArmVst1S128;
+        InstructionCode op = kArmVst1F64;
         op |= AddressingModeField::encode(kMode_Operand2_R);
         Emit(op, 0, nullptr, input_count, inputs);
       } else {
-        DCHECK_NE(MachineRepresentation::kSimd128, store_rep);
         // Store a 64-bit floating point value using two 32-bit integer stores.
         // Computing the store address here would require three live temporary
         // registers (fp<63:32>, fp<31:0>, address), so compute base + 4 after
@@ -942,7 +963,8 @@ void InstructionSelector::VisitWord32Shr(Node* node) {
     uint32_t lsb = m.right().Value();
     Int32BinopMatcher mleft(m.left().node());
     if (mleft.right().HasValue()) {
-      uint32_t value = (mleft.right().Value() >> lsb) << lsb;
+      uint32_t value = static_cast<uint32_t>(mleft.right().Value() >> lsb)
+                       << lsb;
       uint32_t width = base::bits::CountPopulation(value);
       uint32_t msb = base::bits::CountLeadingZeros32(value);
       if ((width != 0) && (msb + width + lsb == 32)) {
@@ -1119,6 +1141,10 @@ void InstructionSelector::VisitWord32ReverseBytes(Node* node) {
   VisitRR(this, kArmRev, node);
 }
 
+void InstructionSelector::VisitSimd128ReverseBytes(Node* node) {
+  UNREACHABLE();  // NOTE(review): presumably never selected on ARM - confirm.
+}
+
 void InstructionSelector::VisitWord32Popcnt(Node* node) { UNREACHABLE(); }
 
 void InstructionSelector::VisitInt32Add(Node* node) {
@@ -2513,6 +2539,14 @@ SIMD_BINOP_LIST(SIMD_VISIT_BINOP)
 #undef SIMD_VISIT_BINOP
 #undef SIMD_BINOP_LIST
 
+void InstructionSelector::VisitF32x4Sqrt(Node* node) {
+  ArmOperandGenerator g(this);
+  // Pin both operands to q0: it lies in the lower 8 Q-registers, whose lanes
+  // alias S0-S31 and can therefore be addressed directly as S-registers.
+  InstructionOperand output = g.DefineAsFixed(node, q0);
+  Emit(kArmF32x4Sqrt, output, g.UseFixed(node->InputAt(0), q0));
+}
+
 void InstructionSelector::VisitF32x4Div(Node* node) {
   ArmOperandGenerator g(this);
   // Use fixed registers in the lower 8 Q-registers so we can directly access