Skip to content

Commit

Permalink
[GC] Add option G1BarrierSimple to use simple g1 post barrier
Browse files Browse the repository at this point in the history
Summary: Provide the option G1BarrierSimple to use a simple G1 post barrier for better mutator performance

Testing: CI pipeline

Reviewers: yude, yifeng

Issue: dragonwell-project#828
  • Loading branch information
mmyxym committed May 31, 2024
1 parent ad233ed commit 74f3822
Show file tree
Hide file tree
Showing 23 changed files with 551 additions and 21 deletions.
39 changes: 39 additions & 0 deletions src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,29 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm

void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
Register start, Register count, Register scratch, RegSet saved_regs) {
if (G1BarrierSimple) {

Label L_loop, L_done;
const Register end = count;

__ cbz(count, L_done); // zero count - nothing to do

__ lea(end, Address(start, count, Address::lsl(LogBytesPerHeapOop))); // end = start + count << LogBytesPerHeapOop
__ sub(end, end, BytesPerHeapOop); // last element address to make inclusive
__ lsr(start, start, CardTable::card_shift);
__ lsr(end, end, CardTable::card_shift);
__ sub(count, end, start); // number of bytes to copy

__ load_byte_map_base(scratch);
__ add(start, start, scratch);
__ bind(L_loop);
__ strb(zr, Address(start, count));
__ subs(count, count, 1);
__ br(Assembler::GE, L_loop);
__ bind(L_done);
return;
}

__ push(saved_regs, sp);
assert_different_registers(start, count, scratch);
assert_different_registers(c_rarg0, count);
Expand Down Expand Up @@ -209,6 +232,15 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
Label done;
Label runtime;

if (G1BarrierSimple) {
const Register card_addr = tmp;
__ lsr(card_addr, store_addr, CardTable::card_shift);
__ load_byte_map_base(tmp2);
__ add(card_addr, card_addr, tmp2);
__ strb(zr, Address(card_addr));
__ bind(done);
return;
}
// Does store cross heap regions?

__ eor(tmp, store_addr, new_val);
Expand Down Expand Up @@ -449,6 +481,13 @@ void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler*
__ load_parameter(0, card_offset);
__ lsr(card_offset, card_offset, CardTable::card_shift);
__ load_byte_map_base(byte_map_base);

if (G1BarrierSimple) {
__ strb(zr, Address(byte_map_base, card_offset));
__ bind(done);
__ epilogue();
return;
}
__ ldrb(rscratch1, Address(byte_map_base, card_offset));
__ cmpw(rscratch1, (int)G1CardTable::g1_young_card_val());
__ br(Assembler::EQ, done);
Expand Down
48 changes: 48 additions & 0 deletions src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@

#define __ masm->

#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)

void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators,
Register addr, Register count) {
bool dest_uninitialized = (decorators & IS_DEST_UNINITIALIZED) != 0;
Expand Down Expand Up @@ -97,6 +99,35 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm

void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
Register addr, Register count, Register tmp) {
if (G1BarrierSimple) {
CardTableBarrierSet* ct =
barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
intptr_t disp = (intptr_t) ct->card_table()->byte_map_base();

Label L_loop, L_done;
const Register end = count;
assert_different_registers(addr, end);

__ testl(count, count);
__ jcc(Assembler::zero, L_done); // zero count - nothing to do

__ leaq(end, Address(addr, count, TIMES_OOP, 0)); // end == addr+count*oop_size
__ subptr(end, BytesPerHeapOop); // end - 1 to make inclusive
__ shrptr(addr, CardTable::card_shift);
__ shrptr(end, CardTable::card_shift);
__ subptr(end, addr); // end --> cards count

__ mov64(tmp, disp);
__ addptr(addr, tmp);
__ bind(L_loop);
__ movb(Address(addr, count, Address::times_1), 0);
__ decrement(count);
__ jcc(Assembler::greaterEqual, L_loop);
__ bind(L_done);

return;
}

__ pusha(); // push registers (overkill)
#ifdef _LP64
if (c_rarg0 == count) { // On win64 c_rarg0 == rcx
Expand Down Expand Up @@ -302,6 +333,12 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
__ movptr(cardtable, (intptr_t)ct->card_table()->byte_map_base());
__ addptr(card_addr, cardtable);

if (G1BarrierSimple) {
__ movb(Address(card_addr, 0), (int)G1CardTable::dirty_card_val());
__ bind(done);
return;
}

__ cmpb(Address(card_addr, 0), (int)G1CardTable::g1_young_card_val());
__ jcc(Assembler::equal, done);

Expand Down Expand Up @@ -551,6 +588,17 @@ void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler*

NOT_LP64(__ get_thread(thread);)

if (G1BarrierSimple) {
__ movb(Address(card_addr, 0), (int)G1CardTable::dirty_card_val());
__ bind(done);
__ pop(rcx);
__ pop(rax);

__ epilogue();

return;
}

__ cmpb(Address(card_addr, 0), (int)G1CardTable::g1_young_card_val());
__ jcc(Assembler::equal, done);

Expand Down
40 changes: 40 additions & 0 deletions src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,13 @@ void G1BarrierSetC2::post_barrier(GraphKit* kit,
// Combine card table base and card offset
Node* card_adr = __ AddP(no_base, byte_map_base_node(kit), card_offset );

if (G1BarrierSimple) {
__ store(__ ctrl(), card_adr, dirty_card, T_BYTE, Compile::AliasIdxRaw, MemNode::unordered);
// Final sync IdealKit and GraphKit.
kit->final_sync(ideal);
return;
}

// If we know the value being stored does it cross regions?

if (val != NULL) {
Expand Down Expand Up @@ -658,6 +665,39 @@ bool G1BarrierSetC2::is_gc_barrier_node(Node* node) const {

void G1BarrierSetC2::eliminate_gc_barrier(PhaseMacroExpand* macro, Node* node) const {
assert(node->Opcode() == Op_CastP2X, "ConvP2XNode required");

if (G1BarrierSimple) {
Node* this_region = node->in(0);
// Search "if (marking != 0)" check and set it to "false".
// There is no G1 pre barrier if previous stored value is NULL
// (for example, after initialization).
if (this_region->is_Region() && this_region->req() == 3) {
int ind = 1;
if (!this_region->in(ind)->is_IfFalse()) {
ind = 2;
}
if (this_region->in(ind)->is_IfFalse() &&
this_region->in(ind)->in(0)->Opcode() == Op_If) {
Node* bol = this_region->in(ind)->in(0)->in(1);
assert(bol->is_Bool(), "");
Node* cmpx = bol->in(1);
if (bol->as_Bool()->_test._test == BoolTest::ne &&
cmpx->is_Cmp() && cmpx->in(2) == macro->intcon(0) &&
cmpx->in(1)->is_Load()) {
Node* adr = cmpx->in(1)->as_Load()->in(MemNode::Address);
const int marking_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset());
if (adr->is_AddP() && adr->in(AddPNode::Base) == macro->top() &&
adr->in(AddPNode::Address)->Opcode() == Op_ThreadLocal &&
adr->in(AddPNode::Offset) == macro->MakeConX(marking_offset)) {
macro->replace_node(cmpx, macro->makecon(TypeInt::CC_EQ));
}
}
}
}
CardTableBarrierSetC2::eliminate_gc_barrier(macro, node);
return;
}

assert(node->outcnt() <= 2, "expects 1 or 2 users: Xor and URShift nodes");
// It could be only one user, URShift node, in Object.clone() intrinsic
// but the new allocation is passed to arraycopy stub and it could not
Expand Down
11 changes: 10 additions & 1 deletion src/hotspot/share/gc/g1/g1Arguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,16 @@ void G1Arguments::initialize() {
vm_exit_during_initialization("The flag -XX:+UseG1GC can not be combined with -XX:ParallelGCThreads=0", NULL);
}

if (FLAG_IS_DEFAULT(G1ConcRefinementThreads)) {
if (G1BarrierSimple) {
#if !defined(_LP64) || !(defined(X86) || defined(AARCH64))
warning("G1BarrierSimple is not supported with current platform"
"; ignoring G1BarrierSimple flag.");
FLAG_SET_DEFAULT(G1BarrierSimple, false);
#else
FLAG_SET_DEFAULT(G1ConcRefinementThreads, 0);
FLAG_SET_DEFAULT(G1ConcRSLogCacheSize, 0);
#endif
} else if (FLAG_IS_DEFAULT(G1ConcRefinementThreads)) {
FLAG_SET_ERGO(uint, G1ConcRefinementThreads, ParallelGCThreads);
}

Expand Down
5 changes: 5 additions & 0 deletions src/hotspot/share/gc/g1/g1BarrierSet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ void G1BarrierSet::write_ref_array_pre(narrowOop* dst, size_t count, bool dest_u
}

void G1BarrierSet::write_ref_field_post_slow(volatile jbyte* byte) {
assert(!G1BarrierSimple, "sanity");
// In the slow path, we know a card is not young
assert(*byte != G1CardTable::g1_young_card_val(), "slow path invoked without filtering");
OrderAccess::storeload();
Expand All @@ -120,6 +121,10 @@ void G1BarrierSet::invalidate(MemRegion mr) {
volatile jbyte* byte = _card_table->byte_for(mr.start());
jbyte* last_byte = _card_table->byte_for(mr.last());
Thread* thr = Thread::current();
if (G1BarrierSimple) {
memset((void*)byte, G1CardTable::dirty_card_val(), last_byte - byte + 1);
return;
}
// skip all consecutive young cards
for (; byte <= last_byte && *byte == G1CardTable::g1_young_card_val(); byte++);

Expand Down
4 changes: 4 additions & 0 deletions src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ inline void G1BarrierSet::write_ref_field_pre(T* field) {
template <DecoratorSet decorators, typename T>
inline void G1BarrierSet::write_ref_field_post(T* field, oop new_val) {
volatile jbyte* byte = _card_table->byte_for(field);
if (G1BarrierSimple) {
*byte = G1CardTable::dirty_card_val();
return;
}
if (*byte != G1CardTable::g1_young_card_val()) {
// Take a slow path for cards in old
write_ref_field_post_slow(byte);
Expand Down
9 changes: 8 additions & 1 deletion src/hotspot/share/gc/g1/g1CardTable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ bool G1CardTable::mark_card_deferred(size_t card_index) {
}

void G1CardTable::g1_mark_as_young(const MemRegion& mr) {
assert(!G1BarrierSimple, "Should not be called with G1BarrierSimple");

jbyte *const first = byte_for(mr.start());
jbyte *const last = byte_after(mr.last());

Expand All @@ -61,7 +63,9 @@ void G1CardTable::g1_mark_as_young(const MemRegion& mr) {

#ifndef PRODUCT
void G1CardTable::verify_g1_young_region(MemRegion mr) {
  // With the simple barrier, cards are never marked with the young value,
  // so there is nothing to verify in that mode.
  if (!G1BarrierSimple) {
    verify_region(mr, g1_young_gen, true /* val_equals */);
  }
}
#endif

Expand Down Expand Up @@ -97,6 +101,9 @@ void G1CardTable::initialize(G1RegionToSpaceMapper* mapper) {
}

bool G1CardTable::is_in_young(oop obj) const {
  // With the simple barrier the card table carries no young-card marks,
  // so ask the containing heap region for its age instead.
  if (G1BarrierSimple) {
    HeapRegion* hr = G1CollectedHeap::heap()->heap_region_containing(obj);
    return hr->is_young();
  }
  volatile jbyte* card = byte_for(obj);
  return *card == G1CardTable::g1_young_card_val();
}
24 changes: 19 additions & 5 deletions src/hotspot/share/gc/g1/g1CollectedHeap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2181,6 +2181,12 @@ void G1CollectedHeap::heap_region_par_iterate_from_start(HeapRegionClosure* cl,
_hrm.par_iterate(cl, hrclaimer, 0);
}

void G1CollectedHeap::heap_region_par_iterate_chunk_based(HeapRegionClosure* cl,
                                                          HeapRegionChunkClaimer* chunk_claimer,
                                                          uint worker_id) const {
  // Each worker begins at its own per-worker offset — presumably to spread
  // claim attempts across the region space and reduce contention; confirm
  // against HeapRegionChunkClaimer::offset_for_worker.
  const uint start_offset = chunk_claimer->offset_for_worker(worker_id);
  _hrm.par_iterate(cl, chunk_claimer, start_offset);
}

// Apply the closure "cl" to each region in the current collection set.
void G1CollectedHeap::collection_set_iterate(HeapRegionClosure* cl) {
  _collection_set.iterate(cl);
}
Expand Down Expand Up @@ -2671,7 +2677,9 @@ class RegisterHumongousWithInCSetFastTestClosure : public HeapRegionClosure {
if (g1h->is_in_closed_subset(ct->addr_for(card_ptr))) {
if (*card_ptr != G1CardTable::dirty_card_val()) {
*card_ptr = G1CardTable::dirty_card_val();
_dcq.enqueue(card_ptr);
if (!G1BarrierSimple) {
_dcq.enqueue(card_ptr);
}
}
}
}
Expand Down Expand Up @@ -3188,6 +3196,7 @@ class G1ParTask : public AbstractGangTask {
G1RootProcessor* _root_processor;
TaskTerminator _terminator;
uint _n_workers;
HeapRegionChunkClaimer _chunk_claimer;

public:
G1ParTask(G1CollectedHeap* g1h, G1ParScanThreadStateSet* per_thread_states, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers)
Expand All @@ -3197,7 +3206,8 @@ class G1ParTask : public AbstractGangTask {
_queues(task_queues),
_root_processor(root_processor),
_terminator(n_workers, _queues),
_n_workers(n_workers)
_n_workers(n_workers),
_chunk_claimer(n_workers)
{}

void work(uint worker_id) {
Expand All @@ -3223,7 +3233,7 @@ class G1ParTask : public AbstractGangTask {
// treating the nmethods visited to act as roots for concurrent marking.
// We only want to make sure that the oops in the nmethods are adjusted with regard to the
// objects copied by the current evacuation.
_g1h->g1_rem_set()->oops_into_collection_set_do(pss, worker_id);
_g1h->g1_rem_set()->oops_into_collection_set_do(pss, worker_id, &_chunk_claimer);

double strong_roots_sec = os::elapsedTime() - start_strong_roots_sec;

Expand Down Expand Up @@ -3718,8 +3728,12 @@ void G1CollectedHeap::redirty_logged_cards() {
dirty_card_queue_set().reset_for_par_iteration();
workers()->run_task(&redirty_task);

DirtyCardQueueSet& dcq = G1BarrierSet::dirty_card_queue_set();
dcq.merge_bufferlists(&dirty_card_queue_set());
if (G1BarrierSimple) {
dirty_card_queue_set().clear();
} else {
DirtyCardQueueSet& dcq = G1BarrierSet::dirty_card_queue_set();
dcq.merge_bufferlists(&dirty_card_queue_set());
}
assert(dirty_card_queue_set().completed_buffers_num() == 0, "All should be consumed");

g1_policy()->phase_times()->record_redirty_logged_cards_time_ms((os::elapsedTime() - redirty_logged_cards_start) * 1000.0);
Expand Down
5 changes: 5 additions & 0 deletions src/hotspot/share/gc/g1/g1CollectedHeap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ class G1CollectedHeap : public CollectedHeap {

// Other related classes.
friend class HeapRegionClaimer;
friend class HeapRegionChunkClaimer;

// Testing classes.
friend class G1CheckCSetFastTableClosure;
Expand Down Expand Up @@ -1167,6 +1168,10 @@ class G1CollectedHeap : public CollectedHeap {
void heap_region_par_iterate_from_start(HeapRegionClosure* cl,
HeapRegionClaimer* hrclaimer) const;

void heap_region_par_iterate_chunk_based(HeapRegionClosure* cl,
HeapRegionChunkClaimer* chunk_claimer,
uint worker_id) const;

// Iterate over the regions (if any) in the current collection set.
void collection_set_iterate(HeapRegionClosure* blk);

Expand Down
5 changes: 5 additions & 0 deletions src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,11 @@ inline void
G1CollectedHeap::dirty_young_block(HeapWord* start, size_t word_size) {
assert_heap_not_locked();

if (G1BarrierSimple) {
// Not necessary to mark card young
return;
}

// Assign the containing region to containing_hr so that we don't
// have to keep calling heap_region_containing() in the
// asserts below.
Expand Down
1 change: 1 addition & 0 deletions src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ void G1ConcurrentRefineThread::deactivate() {
}

void G1ConcurrentRefineThread::run_service() {
assert(!G1BarrierSimple, "Concurrent Refinement should be disabled");
_vtime_start = os::elapsedVTime();

while (!should_terminate()) {
Expand Down
Loading

0 comments on commit 74f3822

Please sign in to comment.