diff --git a/docs/mold.1 b/docs/mold.1
index 5f833b78e6..10addaa8af 100644
--- a/docs/mold.1
+++ b/docs/mold.1
@@ -1198,6 +1198,8 @@ Mark DSO non-deletable at runtime.
 .It Fl z Cm nodlopen
 Mark DSO not available to
 .Xr dlopen 3 .
+This option makes it possible for the linker to optimize thread-local
+variable accesses by rewriting instructions for some targets.
 .Pp
 .It Fl z Cm nodump
 Mark DSO not available to
diff --git a/elf/arch-i386.cc b/elf/arch-i386.cc
index 41d77a1291..cfb70f1d7d 100644
--- a/elf/arch-i386.cc
+++ b/elf/arch-i386.cc
@@ -140,6 +140,7 @@ void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
       0xcc,                   // (padding)
     };
     memcpy(buf, insn, sizeof(insn));
+    *(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
     *(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr;
   } else {
     static const u8 insn[] = {
@@ -149,10 +150,9 @@ void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
       0xcc,                   // (padding)
     };
     memcpy(buf, insn, sizeof(insn));
+    *(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
     *(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx);
   }
-
-  *(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
 }
 
 template <>
@@ -528,7 +528,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
           ty != R_386_GOT32 && ty != R_386_GOT32X)
         Fatal(ctx) << *this << ": TLS_GD reloc must be followed by PLT or GOT32";
 
-      if (relax_tlsgd(ctx, sym))
+      if (ctx.arg.relax && !ctx.arg.shared && !sym.is_imported)
         i++;
       else
         sym.flags.fetch_or(NEEDS_TLSGD, std::memory_order_relaxed);
@@ -542,7 +542,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
           ty != R_386_GOT32 && ty != R_386_GOT32X)
         Fatal(ctx) << *this << ": TLS_LDM reloc must be followed by PLT or GOT32";
 
-      if (relax_tlsld(ctx))
+      if (ctx.arg.relax && !ctx.arg.shared)
         i++;
       else
         ctx.needs_tlsld.store(true, std::memory_order_relaxed);
diff --git a/elf/arch-s390x.cc b/elf/arch-s390x.cc
index 3798d03ebe..84564305c1 100644
--- a/elf/arch-s390x.cc
+++ b/elf/arch-s390x.cc
@@ -456,12 +456,13 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       break;
     case R_390_TLS_GD32:
     case R_390_TLS_GD64:
-      if (!relax_tlsgd(ctx, sym))
+      if (bool do_relax = ctx.arg.relax && !ctx.arg.shared && !sym.is_imported;
+          !do_relax)
         sym.flags.fetch_or(NEEDS_TLSGD, std::memory_order_relaxed);
       break;
     case R_390_TLS_LDM32:
     case R_390_TLS_LDM64:
-      if (!relax_tlsld(ctx))
+      if (bool do_relax = ctx.arg.relax && !ctx.arg.shared; !do_relax)
         ctx.needs_tlsld.store(true, std::memory_order_relaxed);
       break;
     case R_390_TLS_LE32:
diff --git a/elf/arch-x86-64.cc b/elf/arch-x86-64.cc
index db1fd7f128..dcd3a42626 100644
--- a/elf/arch-x86-64.cc
+++ b/elf/arch-x86-64.cc
@@ -234,8 +234,37 @@ static void relax_gd_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
   }
 }
 
+static void relax_gd_to_ie(u8 *loc, ElfRel<E> rel, u64 val) { // GD -> IE
+  switch (rel.r_type) { // caller passes the reloc that follows the TLSGD one
+  case R_X86_64_PLT32:
+  case R_X86_64_PC32:
+  case R_X86_64_GOTPCREL:
+  case R_X86_64_GOTPCRELX: {
+    static const u8 insn[] = {
+      0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax
+      0x48, 0x03, 0x05, 0, 0, 0, 0,             // add foo@gottpoff(%rip), %rax
+    };
+    memcpy(loc - 4, insn, sizeof(insn)); // writes 16 bytes: loc - 4 .. loc + 11
+    *(ul32 *)(loc + 8) = val - 12; // disp32 (insn[12..15]); add ends at loc + 12
+    break;
+  }
+  case R_X86_64_PLTOFF64: {
+    static const u8 insn[] = {
+      0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax
+      0x48, 0x03, 0x05, 0, 0, 0, 0,             // add foo@gottpoff(%rip), %rax
+      0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00,       // nop
+    };
+    memcpy(loc - 3, insn, sizeof(insn)); // writes 22 bytes: loc - 3 .. loc + 18
+    *(ul32 *)(loc + 9) = val - 13; // disp32 (insn[12..15]); add ends at loc + 13
+    break;
+  }
+  default:
+    unreachable();
+  }
+}
+
 // Rewrite a function call to __tls_get_addr to a cheaper instruction
-// sequence. The difference from relax_ld_to_le is that we are
+// sequence. The difference from relax_gd_to_le is that we are
 // materializing a Dynamic Thread Pointer for the current ELF module
 // instead of an address for a particular thread-local variable.
 static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
@@ -416,6 +445,9 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     case R_X86_64_TLSGD:
       if (sym.has_tlsgd(ctx)) {
         write32s(sym.get_tlsgd_addr(ctx) + A - P);
+      } else if (sym.has_gottp(ctx)) {
+        relax_gd_to_ie(loc, rels[i + 1], sym.get_gottp_addr(ctx) - P);
+        i++;
       } else {
         relax_gd_to_le(loc, rels[i + 1], S - ctx.tp_addr);
         i++;
@@ -662,7 +694,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
       if (sym.is_imported)
         sym.flags.fetch_or(NEEDS_PLT, std::memory_order_relaxed);
       break;
-    case R_X86_64_TLSGD: {
+    case R_X86_64_TLSGD:
       if (rel.r_addend != -4)
         Fatal(ctx) << *this << ": bad r_addend for R_X86_64_TLSGD";
 
@@ -675,13 +707,17 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
           ty != R_X86_64_GOTPCRELX)
         Fatal(ctx) << *this << ": TLSGD reloc must be followed by PLT or GOTPCREL";
 
-      if (relax_tlsgd(ctx, sym))
+      if (ctx.arg.relax && !sym.is_imported && !ctx.arg.shared) {
         i++;
-      else
+      } else if (ctx.arg.relax && !sym.is_imported && ctx.arg.shared &&
+                 !ctx.arg.z_dlopen) {
+        sym.flags.fetch_or(NEEDS_GOTTP, std::memory_order_relaxed);
+        i++;
+      } else {
         sym.flags.fetch_or(NEEDS_TLSGD, std::memory_order_relaxed);
+      }
       break;
-    }
-    case R_X86_64_TLSLD: {
+    case R_X86_64_TLSLD:
       if (rel.r_addend != -4)
         Fatal(ctx) << *this << ": bad r_addend for R_X86_64_TLSLD";
 
@@ -694,12 +730,11 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
           ty != R_X86_64_GOTPCRELX)
         Fatal(ctx) << *this << ": TLSLD reloc must be followed by PLT or GOTPCREL";
 
-      if (relax_tlsld(ctx))
+      if (ctx.arg.relax && !ctx.arg.shared)
         i++;
       else
         ctx.needs_tlsld.store(true, std::memory_order_relaxed);
       break;
-    }
     case R_X86_64_GOTTPOFF: {
       if (rel.r_addend != -4)
         Fatal(ctx) << *this << ": bad r_addend for R_X86_64_GOTTPOFF";
diff --git a/elf/mold.h b/elf/mold.h
index 3b11c0e84c..d9ef3264df 100644
--- a/elf/mold.h
+++ b/elf/mold.h
@@ -2826,16 +2826,6 @@ inline bool is_c_identifier(std::string_view s) {
   return true;
 }
 
-template <typename E>
-inline bool relax_tlsgd(Context<E> &ctx, Symbol<E> &sym) {
-  return ctx.arg.relax && !ctx.arg.shared && !sym.is_imported;
-}
-
-template <typename E>
-inline bool relax_tlsld(Context<E> &ctx) {
-  return ctx.arg.relax && !ctx.arg.shared;
-}
-
 template <typename E>
 inline bool relax_tlsdesc(Context<E> &ctx, Symbol<E> &sym) {
   // TLSDESC relocs must be always relaxed for statically-linked
diff --git a/test/elf/tls-gd-to-ie.sh b/test/elf/tls-gd-to-ie.sh
new file mode 100755
index 0000000000..2851547ff4
--- /dev/null
+++ b/test/elf/tls-gd-to-ie.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+cat <<EOF | $GCC -fPIC -c -o $t/a.o -xc -
+#include <stdio.h>
+
+__attribute__((tls_model("global-dynamic"))) static _Thread_local int x1 = 1;
+__attribute__((tls_model("global-dynamic"))) _Thread_local int x2 = 2;
+__attribute__((tls_model("global-dynamic"))) _Thread_local int x3;
+
+int foo() {
+  x3 = 3;
+
+  printf("%d %d %d\n", x1, x2, x3);
+  return 0;
+}
+EOF
+
+cat <<EOF | $CC -fPIC -c -o $t/b.o -xc -
+int foo();
+int main() { foo(); }
+EOF
+
+$CC -B. -shared -o $t/c.so $t/a.o
+$CC -B. -o $t/exe1 $t/b.o $t/c.so
+$QEMU $t/exe1 | grep -q '1 2 3'
+
+$CC -B. -shared -o $t/d.so $t/a.o -Wl,-no-relax
+$CC -B. -o $t/exe2 $t/b.o $t/d.so
+$QEMU $t/exe2 | grep -q '1 2 3'
+
+$CC -B. -shared -o $t/e.so $t/a.o -Wl,-z,nodlopen
+$CC -B. -o $t/exe3 $t/b.o $t/e.so
+$QEMU $t/exe3 | grep -q '1 2 3'
+
+$CC -B. -shared -o $t/f.so $t/a.o -Wl,-z,nodlopen -Wl,-no-relax
+$CC -B. -o $t/exe4 $t/b.o $t/f.so
+$QEMU $t/exe4 | grep -q '1 2 3'
diff --git a/test/elf/x86_64_ifunc-alias.sh b/test/elf/x86_64_ifunc-alias.sh
old mode 100644
new mode 100755
diff --git a/test/elf/x86_64_tls-gd-to-ie.sh b/test/elf/x86_64_tls-gd-to-ie.sh
new file mode 100755
index 0000000000..257d52bf7d
--- /dev/null
+++ b/test/elf/x86_64_tls-gd-to-ie.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+cat <<EOF | $GCC -fPIC -c -o $t/a.o -xc - -mcmodel=large
+#include <stdio.h>
+
+__attribute__((tls_model("global-dynamic"))) static _Thread_local int x1 = 1;
+__attribute__((tls_model("global-dynamic"))) _Thread_local int x2 = 2;
+__attribute__((tls_model("global-dynamic"))) _Thread_local int x3;
+
+int foo() {
+  x3 = 3;
+
+  printf("%d %d %d\n", x1, x2, x3);
+  return 0;
+}
+EOF
+
+cat <<EOF | $CC -fPIC -c -o $t/b.o -xc -
+int foo();
+int main() { foo(); }
+EOF
+
+$CC -B. -shared -o $t/c.so $t/a.o
+$CC -B. -o $t/exe1 $t/b.o $t/c.so
+$QEMU $t/exe1 | grep -q '1 2 3'
+
+$CC -B. -shared -o $t/d.so $t/a.o -Wl,-no-relax
+$CC -B. -o $t/exe2 $t/b.o $t/d.so
+$QEMU $t/exe2 | grep -q '1 2 3'
+
+$CC -B. -shared -o $t/e.so $t/a.o -Wl,-z,nodlopen
+$CC -B. -o $t/exe3 $t/b.o $t/e.so
+$QEMU $t/exe3 | grep -q '1 2 3'
+
+$CC -B. -shared -o $t/f.so $t/a.o -Wl,-z,nodlopen -Wl,-no-relax
+$CC -B. -o $t/exe4 $t/b.o $t/f.so
+$QEMU $t/exe4 | grep -q '1 2 3'