﻿
#pragma comment(lib, "LLVM-21.lib")

#include "TeInlinerInterface.h"

#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/PassPlugin.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h" 
#include "llvm/Support/raw_ostream.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Transforms/Utils/Instrumentation.h"
#include "llvm/Transforms/Utils/CodeLayout.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Threading.h"

// 最適化パスのヘッダ
#include "llvm/Transforms/Scalar/SROA.h"
#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
#include "llvm/Transforms/Scalar/ADCE.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Transforms/IPO/MergeFunctions.h"
#include <iterator>
#include <limits>

#define NOMINMAX
#include <windows.h>

using namespace llvm;

// ログを出力したい場合は1。通常ビルド時は0。
#define TE_ENABLE_LOGGING 0
#define TE_ENABLE_MISSED_OPPORTUNITY_LOGGING 0
#define TE_ENABLE_AFFINITY_ORDER_LOGGING 0
#define TE_ENABLE_SORT_LOGGING 0

namespace {

// Resolve the function a call site targets directly, looking through pointer
// casts on the called operand.
//
// Returns nullptr when CB is null, when the call has no called operand, or
// when the (cast-stripped) operand is not a Function.
static Function* getDirectCallee(CallBase* CB) {
  if (CB == nullptr)
    return nullptr;

  // Fast path: the call already names a function directly.
  Function* Direct = CB->getCalledFunction();
  if (Direct != nullptr)
    return Direct;

  // Slow path: peel pointer casts off the raw called operand.
  Value* Target = CB->getCalledOperand();
  return Target ? dyn_cast<Function>(Target->stripPointerCasts()) : nullptr;
}

// ============================================================================
// Function size measurement via TTI
// ============================================================================
// Sums the TCK_CodeSize cost of every instruction in F.
//
// Negative per-instruction costs are clamped to zero. On success the total is
// stored in OutSize and true is returned. Returns false, leaving OutSize
// untouched, if any instruction has an invalid cost or if the running total
// would overflow int64_t.
static bool computeFunctionTotalSize(const Function& F,
                                     const TargetTransformInfo& TTI,
                                     int64_t& OutSize) {
  int64_t Total = 0;

  for (const Instruction& Inst : instructions(F)) {
    const InstructionCost Cost =
        TTI.getInstructionCost(&Inst, TargetTransformInfo::TCK_CodeSize);
    if (!Cost.isValid())
      return false;

    int64_t Amount = Cost.getValue();
    if (Amount < 0)
      Amount = 0;

    // Refuse to wrap around: Total + Amount must stay within int64_t.
    if (Total > INT64_MAX - Amount)
      return false;

    Total += Amount;
  }

  OutSize = Total;
  return true;
}

// ============================================================================
// Helper: perform the inline, optionally followed by cleanup optimizations
// ============================================================================
// Inlines the call site CB (which must be non-null and live in F) and
// invalidates every cached analysis of F in FAM.
//
// When IsSimulation is true, a small function-level cleanup pipeline
// (SROA only if F contains an alloca, then EarlyCSE, SimplifyCFG, InstCombine,
// ADCE) is run on F afterwards; when false, F is left as InlineFunction
// produced it.
//
// Returns false if CB is detached from any basic block or if InlineFunction
// reports failure; true otherwise.
static bool applyInlineAndOptimize(Function& F, CallBase* CB,
                                   FunctionAnalysisManager& FAM,
                                   bool IsSimulation) {
  // A call that is no longer attached to a block cannot be inlined.
  if (CB->getParent() == nullptr)
    return false;

  InlineFunctionInfo InlineInfo;
  if (!InlineFunction(*CB, InlineInfo, /*MergeAttributes=*/true).isSuccess())
    return false;

  // The body of F changed: drop all cached analyses for it.
  FAM.invalidate(F, PreservedAnalyses::none());

  if (!IsSimulation)
    return true;

  // SROA is only worth scheduling when there is at least one alloca.
  bool HasAlloca = false;
  for (const Instruction& Inst : instructions(F)) {
    if (isa<AllocaInst>(&Inst)) {
      HasAlloca = true;
      break;
    }
  }

  // Cleanup pipeline.
  FunctionPassManager Cleanup;
  if (HasAlloca)
    Cleanup.addPass(SROAPass(SROAOptions::ModifyCFG));
  Cleanup.addPass(EarlyCSEPass());
  Cleanup.addPass(SimplifyCFGPass());
  Cleanup.addPass(InstCombinePass());
  Cleanup.addPass(ADCEPass());
  Cleanup.run(F, FAM);

  return true;
}

// ============================================================================
// Clone -> inline -> optimize -> measure
// ============================================================================
// Estimates the size OriginalF would have after inlining the call site CB,
// without modifying OriginalF: the function is cloned, the corresponding call
// in the clone is inlined and cleaned up, the clone is measured, and the clone
// is then removed from the module again.
//
// Success is set to true only when a measurement was produced. On any failure
// (null CB, clone failure, call not found in the clone, inline failure) the
// function returns INT64_MAX with Success == false.
//
// The returned size is the TTI code-size total of the clone; if that cannot be
// computed, the clone's raw instruction count is returned instead.
// NOTE(review): the fallback count is in a different unit from the TTI cost,
// and the TTI here comes from a default-constructed PassBuilder rather than
// from the caller's analysis manager — confirm callers are fine comparing it
// against a size measured with their own TTI.
static int64_t measureSizeAfterInliningAndOptimization(Function& OriginalF,
                                                       CallBase* CB,
                                                       bool& Success) {
  Success = false;
  if (!CB) return INT64_MAX;

  // Clone the function; the clone is inserted into the same module.
  ValueToValueMapTy VMap;
  Function* ClonedF = CloneFunction(&OriginalF, VMap);
  if (!ClonedF) return INT64_MAX;

  // Locate the call site inside the clone that corresponds to CB.
  CallBase* ClonedCB = dyn_cast_or_null<CallBase>(VMap[CB]);
  if (!ClonedCB) {
    ClonedF->eraseFromParent();
    return INT64_MAX;
  }

  // Private analysis managers so the simulation never touches the caller's
  // cached analyses. Declaration order follows the usual LLVM convention.
  LoopAnalysisManager LAM;
  FunctionAnalysisManager ClonedFAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(ClonedFAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, ClonedFAM, CGAM, MAM);

  // Drop every analysis result cached for the clone BEFORE deleting it, so
  // that no cached result outlives the IR it describes (the managers are
  // destroyed only when this function returns).
  auto DiscardClone = [&]() {
    ClonedFAM.clear(*ClonedF, ClonedF->getName());
    ClonedF->eraseFromParent();
  };

  // Inline and run the cleanup pipeline on the clone.
  if (!applyInlineAndOptimize(*ClonedF, ClonedCB, ClonedFAM, true)) {
    DiscardClone();
    return INT64_MAX;
  }

  // Measure the result.
  const TargetTransformInfo& ClonedTTI =
      ClonedFAM.getResult<TargetIRAnalysis>(*ClonedF);
  int64_t NewSize = 0;
  if (!computeFunctionTotalSize(*ClonedF, ClonedTTI, NewSize)) {
    // TTI cost unavailable: fall back to the plain instruction count.
    NewSize = static_cast<int64_t>(std::distance(instructions(*ClonedF).begin(),
                                                 instructions(*ClonedF).end()));
  }

  DiscardClone();
  Success = true;
  return NewSize;
}

// ============================================================================
// Main pass
// ============================================================================
// Function pass that iteratively inlines "solitary" callees into F. A callee
// is solitary when it has local linkage, is not available_externally, and has
// exactly one use.
//
// Each round of the loop:
//   1. classifies the caller as hot / warm / cold from profile data and stops
//      if it already exceeds the size limit for that class;
//   2. scans all call sites and sorts them into priority candidates (inlined
//      unconditionally) and normal candidates (subject to size limits);
//   3. inlines every priority candidate;
//   4. simulates each of the first few normal candidates on a clone of F and
//      inlines the single one with the best estimated size saving.
// The loop repeats while a round changed F, up to MaxInlineIterations rounds.
//
// After a successful inline the callee's body is deleted immediately.
struct TeInlinerPass : public PassInfoMixin<TeInlinerPass> {
  // User-defined thresholds.
  const uint64_t ExtendedCalleeSizeLimit = 500;     // Maximum callee size
  const uint64_t MaxCallerSizeForHot = 3000;        // Hot: strict, to avoid overflowing the cache
  const uint64_t MaxCallerSizeForWarm = 10000;      // Warm: keeps the previous limit
  const uint64_t MaxCallerSizeForCold = UINT64_MAX; // Cold: greatly relaxed (cold callers act as a dumping ground)
  const int MaxInlineIterations = 200;              // Upper bound on rounds of the inlining loop per function
  const int64_t MaxColdCalleeSizeInHotCaller = 25;  // In a hot caller, a cold callee at a non-cold call site is skipped when larger than this

  // A candidate call site. The handle becomes null if the call is deleted and
  // follows the value if it is replaced, so it must be re-checked before use.
  struct CandidateInfo {
    WeakTrackingVH Handle;
  };

  // Runs the inliner on F. Returns PreservedAnalyses::none() if at least one
  // call was inlined, PreservedAnalyses::all() otherwise.
  PreservedAnalyses run(Function& F, FunctionAnalysisManager& FAM) {
    if (F.isDeclaration())
      return PreservedAnalyses::all();

    // Fetch analyses. This reference is reused across rounds even though the
    // other analyses of F are invalidated after each inline.
    auto& CallerTTI = FAM.getResult<TargetIRAnalysis>(F);

    bool Changed = false;
    bool MadeChange;
    int IterationCount = 0;

    do {
      MadeChange = false;
      IterationCount++;

      // Safety valve: prevents an endless loop.
      if (IterationCount > MaxInlineIterations) {
        break;
      }

      uint64_t CallerInstCount = F.getInstructionCount(); // Instruction count of the caller

      ProfileSummaryInfo* PSI = nullptr;
      BlockFrequencyInfo* BFI = nullptr;

      // PSI is only used if some earlier pass already computed it; BFI is
      // (re)computed every round because inlining invalidates it.
      auto& MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
      PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
      BFI = &FAM.getResult<BlockFrequencyAnalysis>(F);

      // Classify how hot the caller is.
      bool IsCallerHotFunc = false;
      bool IsCallerColdFunc = false;

      if (PSI && PSI->hasProfileSummary()) {
        if (BFI) {
          IsCallerHotFunc = PSI->isFunctionHotInCallGraph(&F, *BFI);
          IsCallerColdFunc = PSI->isFunctionColdInCallGraph(&F, *BFI);
        } else {
          IsCallerHotFunc = PSI->isFunctionEntryHot(&F);
          IsCallerColdFunc = F.hasFnAttribute(Attribute::Cold) || PSI->isFunctionEntryCold(&F);
        }
      } else {
        IsCallerColdFunc = F.hasFnAttribute(Attribute::Cold);
      }
      bool IsCallerWarmFunc = !IsCallerHotFunc && !IsCallerColdFunc;

      // Size limit for the caller.
      uint64_t CurrentCallerSizeLimit = MaxCallerSizeForWarm;
      if (IsCallerHotFunc) {
        CurrentCallerSizeLimit = MaxCallerSizeForHot;
      } else if (IsCallerColdFunc) {
        CurrentCallerSizeLimit = MaxCallerSizeForCold;
      }

      // Stop when the caller is already too large.
      if (CallerInstCount > CurrentCallerSizeLimit) {
        break;
      }

      // Measure the current function size (TTI cost).
      int64_t CurrentSize = 0;
      if (!computeFunctionTotalSize(F, CallerTTI, CurrentSize)) {
        break;
      }

      CallBase* BestCandidate = nullptr;
      int64_t BestScore = std::numeric_limits<int64_t>::min();
      SmallVector<CandidateInfo, 64> NormalCandidates;
      SmallVector<CandidateInfo, 16> PriorityCandidates;

      for (Instruction& I : instructions(F)) {
        if (CallBase* CB = dyn_cast<CallBase>(&I)) {
          // Skip instructions without a parent block (dead).
          if (!CB->getParent()) continue;

          Function* Callee = getDirectCallee(CB);

          // Basic exclusions.
          if (!Callee || Callee->isDeclaration() || Callee->isIntrinsic() ||
              Callee == &F)
            continue;

          // [IMPORTANT] Workaround for a Windows x64 LLVM 21 bug:
          // inlining variadic functions may produce invalid code, so they are
          // excluded.
          if (Callee->isVarArg()) continue;

          // Exclusions by attribute.
          if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
              Callee->hasFnAttribute(Attribute::NoInline))
            continue;

          // Are the target features of caller and callee compatible?
          if (!CallerTTI.areInlineCompatible(&F, Callee)) continue;

          // Solitary check.
          bool IsSolitary = false;
          if (!Callee->hasAvailableExternallyLinkage() &&
              Callee->hasLocalLinkage() && Callee->hasOneUse()) {
            IsSolitary = true;
          }

          // Only solitary callees are considered.
          if (!IsSolitary) {
            continue;
          }

          const uint64_t CalleeSize = Callee->getInstructionCount();

          // Dynamic size limits & I-cache protection.
          const uint64_t LimitForHot = 100;
          const uint64_t LimitForWarm = 225;
          const uint64_t LimitForCold = 400;  // Rarely executed
          const uint64_t LimitForZero = ExtendedCalleeSizeLimit;  // Never executed
          const uint64_t LimitForHotPair = 250;      // Hot -> Hot
          const uint64_t LimitForWarmPair = 350;     // Warm -> Warm
          uint64_t CurrentSizeLimit = LimitForWarm;  // Default

          bool IsCallSiteCold = false;
          bool IsCalleeCold = false;

          if (PSI && PSI->hasProfileSummary()) {
            bool IsCalleeHot = PSI->isFunctionEntryHot(Callee);
            if (IsCalleeHot) {
              IsCalleeCold = false;
            } else {
              IsCalleeCold = Callee->hasFnAttribute(Attribute::Cold) ||
                             PSI->isFunctionEntryCold(Callee);
            }
            bool IsCalleeWarm = !IsCalleeHot && !IsCalleeCold;

            if (BFI) {
              BasicBlock* BB = CB->getParent();

              // 1. Execution count zero (dead code) -> loosest limit.
              // Checked explicitly because it is a stronger condition than
              // isColdBlock.
              auto OptCount = BFI->getBlockProfileCount(BB);
              if (OptCount.has_value() && OptCount.value() == 0) {
                CurrentSizeLimit = IsCallerHotFunc ? LimitForCold : LimitForZero;
                IsCallSiteCold = true;
              }
              // 2. Hot block -> strict limit (I-cache protection).
              else if (PSI->isHotBlock(BB, BFI)) {
                CurrentSizeLimit = LimitForHot;
                IsCallSiteCold = false;
              }
              // 3. Cold block -> relaxed limit.
              else if (PSI->isColdBlock(BB, BFI)) {
                CurrentSizeLimit = IsCallerHotFunc ? LimitForWarm : LimitForCold;
                IsCallSiteCold = true;
              }
              // 4. Warm block (everything else) -> standard limit.
              else {
                CurrentSizeLimit = LimitForWarm;
                IsCallSiteCold = false;
              }
            } else {
              // Fallback without BFI.
              if (IsCallerHotFunc)
                CurrentSizeLimit = LimitForHot;
              else if (IsCallerColdFunc)
                CurrentSizeLimit = LimitForZero;
              IsCallSiteCold = (CurrentSizeLimit >= LimitForCold);
            }

            // Priority inlining.
            // A call site matching one of the rules below goes straight to the
            // priority list (no simulation); the `continue` skips the stricter
            // checks further down.

            // 1. Cold caller, cold callee.
            if (IsCallerColdFunc && IsCalleeCold) {
              PriorityCandidates.push_back({WeakTrackingVH(CB)});
              continue;
            }
            // 2. Hot caller, hot callee.
            if (IsCallerHotFunc && IsCalleeHot &&
                !IsCallSiteCold &&
                CalleeSize <= LimitForHotPair) {
              PriorityCandidates.push_back({WeakTrackingVH(CB)});
              continue;
            }
            // 3. Warm caller, warm callee.
            if (IsCallerWarmFunc && IsCalleeWarm &&
                CalleeSize <= LimitForWarmPair) {
              PriorityCandidates.push_back({WeakTrackingVH(CB)});
              continue;
            }
          } else {
            // Fallback when no profile summary is available.
            IsCalleeCold = Callee->hasFnAttribute(Attribute::Cold);
          }

          // 1. Cut-off by the dynamic limit.
          if (CalleeSize > CurrentSizeLimit) {
            if (TE_ENABLE_LOGGING) {
              errs() << "[TeInliner] Skip " << Callee->getName()
                     << ": Too large for context (" << CalleeSize << " > "
                     << CurrentSizeLimit << ")\n";
            }
            continue;
          }

          // 2. I-cache pollution guard (keep large cold code out of hot
          //    callers).
          if (IsCallerHotFunc) {
            if (IsCalleeCold && !IsCallSiteCold) {
              if (CalleeSize > MaxColdCalleeSizeInHotCaller) {
                if (TE_ENABLE_LOGGING) {
                  errs() << "[TeInliner] Skip: Large Cold (" << CalleeSize
                         << ") into Hot Caller\n";
                }
                continue;
              }
            }
          }

          NormalCandidates.push_back({WeakTrackingVH(CB)});
        }
      }

      // Sort so that the candidates most likely to shrink the code come first.
      std::stable_sort(
          NormalCandidates.begin(), NormalCandidates.end(),
          [](const CandidateInfo& A, const CandidateInfo& B) {
            if (!A.Handle || !B.Handle) return false;

            CallBase* CBA = dyn_cast_or_null<CallBase>(A.Handle);
            CallBase* CBB = dyn_cast_or_null<CallBase>(B.Handle);
            if (!CBA || !CBB) return false;

            // Helper: count the constant arguments of a call.
            auto CountConstArgs = [](CallBase* CB) -> int {
              return std::count_if(CB->arg_begin(), CB->arg_end(),
                                   [](Value* V) { return isa<Constant>(V); });
            };

            // Criterion 1: more constant arguments first (constant folding is
            // expected to pay off).
            int ConstsA = CountConstArgs(CBA);
            int ConstsB = CountConstArgs(CBB);
            if (ConstsA != ConstsB) {
              return ConstsA > ConstsB;  // More comes first
            }

            // Criterion 2: more arguments overall first (more call overhead
            // to remove: the mov/push instructions before the call).
            if (CBA->arg_size() != CBB->arg_size()) {
              return CBA->arg_size() > CBB->arg_size();  // More comes first
            }

            // Criterion 3: smaller callee first (cheaper to process).
            return getDirectCallee(CBA)->getInstructionCount() <
                   getDirectCallee(CBB)->getInstructionCount();
          });


      // Inline the priority candidates first.
      for (const auto& Info : PriorityCandidates) {
        if (!Info.Handle)
          continue;

        CallBase* CB = dyn_cast_or_null<CallBase>(Info.Handle);
        if (!CB)
          continue;

        // The handle may have followed a replacement, so the callee is
        // re-resolved and may no longer be a direct function.
        Function* Callee = getDirectCallee(CB);
        if (!Callee)
          continue;

        std::string CalleeName = Callee->getName().str();
        std::string CallerName = F.getName().str();

        // Inline unconditionally, without simulation.
        if (applyInlineAndOptimize(F, CB, FAM, false)) {
          if (TE_ENABLE_LOGGING) {
            errs() << formatv("[TeInliner] Priority Inlined {0} into {1}\n",
                              CalleeName, CallerName);
          }

          // Destroy the now-unused ("zombie") callee right away.
          // This encourages chained inlining of nested functions (A->B->C).
          if (!Callee->isDeclaration()) {
            Callee->deleteBody();
          }

          MadeChange = true;
          Changed = true;
        }
      }

      // The priority inlines above changed F. Re-measure it so the savings
      // computed below are relative to the function as it is now; the
      // simulated clones already contain the priority-inlined code, so using
      // the earlier measurement would understate every saving.
      if (MadeChange && !computeFunctionTotalSize(F, CallerTTI, CurrentSize)) {
        break;
      }

      // Normal candidates (with simulation).
      // The number of attempts shrinks as the caller grows.
      uint64_t CallerSize = F.getInstructionCount();
      size_t MaxCandidatesToTry = 50000 / (CallerSize + 1000);
      MaxCandidatesToTry = std::clamp<size_t>(MaxCandidatesToTry, 5, 40);
      if (NormalCandidates.size() > MaxCandidatesToTry) {
        NormalCandidates.resize(MaxCandidatesToTry);
      }

      for (const auto& Info : NormalCandidates) {
        if (!Info.Handle)
          continue;

        CallBase* CB = dyn_cast_or_null<CallBase>(Info.Handle);
        if (!CB)
          continue;

        // A candidate is acceptable even if it grows the function, as long as
        // the growth does not exceed ExtendedCalleeSizeLimit.
        const int64_t AcceptanceThreshold = -static_cast<int64_t>(ExtendedCalleeSizeLimit);

        // Run the simulation.
        bool Success = false;
        int64_t EstimatedNewSize = measureSizeAfterInliningAndOptimization(F, CB, Success);
        if (!Success)
          continue;

        int64_t SizeSaving = CurrentSize - EstimatedNewSize;
        if (TE_ENABLE_LOGGING) {
          errs() << "[TeInliner] Eval " << getDirectCallee(CB)->getName()
                 << ": Saving=" << SizeSaving
                 << " Threshold=" << AcceptanceThreshold
                 << (SizeSaving >= AcceptanceThreshold ? " [ACCEPT]" : " [REJECT]")
                 << "\n";
        }

        // Threshold check; keep the candidate with the largest saving.
        if (SizeSaving >= AcceptanceThreshold) {
          if (SizeSaving > BestScore) {
            BestScore = SizeSaving;
            BestCandidate = CB;
          }
        }
      }

      // If a best candidate was found, perform the real inline.
      // getDirectCallee returns nullptr for a null call site, so this also
      // covers "no candidate".
      if (Function* Callee = getDirectCallee(BestCandidate)) {
        std::string CalleeName = Callee->getName().str();
        std::string CallerName = F.getName().str();

        if (applyInlineAndOptimize(F, BestCandidate, FAM, false)) {
          if (TE_ENABLE_LOGGING) {
            errs() << formatv(
                "[TeInliner] Inlined {0} into {1}. Est.Saving: {2}\n",
                CalleeName, CallerName, BestScore);
          }

          // Destroy the now-unused ("zombie") callee right away.
          // This encourages chained inlining of nested functions (A->B->C).
          if (!Callee->isDeclaration()) {
            Callee->deleteBody();
          }

          MadeChange = true;
          Changed = true;
        }
      }

    } while (MadeChange);

    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }

  // Marks the pass as required so the pass manager never skips it.
  static bool isRequired() { return true; }
};

// ============================================================================
// TeColdOptPass (shrinks cold code)
// ============================================================================
// Module pass that marks never-executed, loop-free functions as `minsize` and
// `optsize`.
//
// A function is marked when all of the following hold:
//   - it is a definition with profile data and an entry count of exactly 0;
//   - it is not already `minsize`;
//   - it is not `optnone` (the IR verifier rejects `optnone` combined with
//     `minsize` or `optsize`, so such functions must be left alone);
//   - LoopAnalysis finds no loops in it.
//
// Returns PreservedAnalyses::none() if any function was marked, otherwise
// PreservedAnalyses::all().
struct TeColdOptPass : public PassInfoMixin<TeColdOptPass> {
  PreservedAnalyses run(Module& M, ModuleAnalysisManager& MAM) {
    bool Changed = false;

    auto& FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

    for (Function& F : M) {
      if (F.isDeclaration()) continue;

      // Condition 1: PGO data is present and the entry count is 0.
      if (!F.hasProfileData()) continue;
      auto EntryCount = F.getEntryCount();
      if (!EntryCount.has_value() || EntryCount->getCount() != 0) continue;

      // Already size-optimized: nothing to do.
      if (F.hasFnAttribute(Attribute::MinSize)) continue;

      // `optnone` is incompatible with `minsize`/`optsize`; adding them would
      // produce IR that fails verification.
      if (F.hasFnAttribute(Attribute::OptimizeNone)) continue;

      // Condition 2: the function contains no loops (straight-line code).
      // Functions with loops (e.g. compute kernels) are excluded because
      // size optimization risks inhibiting vectorization.
      auto& LI = FAM.getResult<LoopAnalysis>(F);
      if (!LI.empty()) continue;

      // Equivalent of -Oz for this function.
      F.addFnAttr(Attribute::MinSize);
      F.addFnAttr(Attribute::OptimizeForSize);
      Changed = true;
    }
    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};

// Per-symbol identifier (variables and functions alike): the GUID that LLVM
// reports for the given global value.
uint64_t GetSymbolGUID(const GlobalValue& GV) {
  const uint64_t Guid = GV.getGUID();
  return Guid;
}

// Per-symbol record stored in SymbolInfoMap.
struct SymbolInfo {
  // Ordering value for the symbol; calculateAffinityOrder uses it as the
  // symbol's placement order.
  // NOTE(review): where and how this is populated is not visible here.
  uint64_t Order;
  // Count for the symbol; calculateAffinityOrder uses it as the fallback
  // frequency when no local frequency is found.
  uint64_t Count;
};
// static inline llvm::DenseMap<uint64_t, uint64_t> ModuleOrderMap;
// Symbol table keyed by the value returned from GetSymbolGUID.
static inline llvm::DenseMap<uint64_t, SymbolInfo> SymbolInfoMap;

// ============================================================================
// TeDataLayoutPass
// ============================================================================
struct TeDataLayoutPass : public PassInfoMixin<TeDataLayoutPass> {
  // Cache-line geometry used by the layout heuristics (sizes in bytes —
  // presumably; TODO confirm against the code that consumes them).
  const uint64_t CacheLineSize = 64;
  const uint64_t HalfCacheLineSize = CacheLineSize / 2;
  const uint64_t MinSizeToMoveCold = CacheLineSize;  // Threshold for isolating cold data

  // Percentile cutoffs passed to ProfileSummaryInfo (percent * 10000).
  const int Cutoff_UltraHot = 900000; // 90.00% (targets L1 cache: very strict)
  const int Cutoff_Hot =      990000; // 99.00% (targets L2 cache: standard hot)
  const int Cutoff_Cold =     999900; // 99.99%


  // Hotness classes. The numeric value indexes subsectionPrefix.
  enum class HotnessRank : int {
    UltraHot = 0, // Targets L1 cache (Top 0.01%)
    Hot = 1,      // Targets L2 cache (Top 1% + promoted entries)
    Warm = 2,     // Standard placement (default)
    Cold = 3      // Isolation target (lowest rank)
  };

  // Section-name prefix per HotnessRank, in enum order.
  static constexpr const char* subsectionPrefix[] = {
    "$AA_tt_hot_",  // UltraHot
    "$AA_tt_hot_",  // Hot
    "$AA_tt_warm_", // Warm
    "$zz_tt_cold_"  // Cold
  };

  // Result of calculateAffinityOrder.
  struct AffinityInfo {
    uint64_t Order;   // Order of the hottest function (time anchor)
    uint64_t Spread;  // Width of the reference range (MaxOrder - MinOrder)
    uint64_t Freq;    // Local execution frequency (hotness)
  };


  // (Order, Freq) pair with equality comparison.
  struct AffinityScore {
    uint64_t Order;
    uint64_t Freq;
    bool operator==(const AffinityScore& other) const {
      return Order == other.Order && Freq == other.Freq;
    }
    bool operator!=(const AffinityScore& other) const {
      return !(*this == other);
    }
  };

  // Sentinel meaning "no order assigned".
  static constexpr uint64_t InvalidOrder = std::numeric_limits<uint64_t>::max();

  // Helper that recursively searches for references to a global.
  //
  // Walks the users of V. Every Instruction user has its entry in Users
  // incremented; users that are constant expressions, global aliases or GEP
  // operators are looked through by recursing into their own users. Visited
  // records the values already expanded so each one is walked at most once.
  void findGlobalUsers(const Value* V,
                       llvm::SmallDenseMap<const Instruction*, uint64_t>& Users,
                       SmallPtrSetImpl<const Value*>& Visited) {
    const bool FirstVisit = Visited.insert(V).second;
    if (!FirstVisit)
      return;

    for (const User* Consumer : V->users()) {
      const auto* Inst = dyn_cast<Instruction>(Consumer);
      if (Inst != nullptr) {
        ++Users[Inst];
        continue;
      }

      // Not an instruction: look through constant-level wrappers.
      const bool IsTransparent = isa<ConstantExpr>(Consumer) ||
                                 isa<GlobalAlias>(Consumer) ||
                                 isa<GEPOperator>(Consumer);
      if (IsTransparent)
        findGlobalUsers(Consumer, Users, Visited);
    }
  }

  // Computes which execution-order group a piece of data should belong to.
  //
  // Starts from the SymbolInfoMap entry of GV itself (Order / Count), if any,
  // then inspects every instruction that references GV (directly or through
  // constant expressions / aliases):
  //   - each reference contributes (block profile count * reference count) to
  //     the local total frequency;
  //   - among references whose enclosing function has a SymbolInfoMap entry,
  //     the one with the strictly largest weighted frequency supplies Order,
  //     and the min/max of their orders supply Spread.
  //
  // Returns {Order, Spread, Freq}. Order is InvalidOrder when nothing assigned
  // one; Spread is the uint64_t maximum when no referencing function has an
  // order; Freq is the local total when it is non-zero, otherwise GV's own
  // Count (or 0).
  AffinityInfo calculateAffinityOrder(GlobalVariable* GV, FunctionAnalysisManager& FAM) {
    const auto SelfIt = SymbolInfoMap.find(GetSymbolGUID(*GV));
    const bool HasSelfInfo = SelfIt != SymbolInfoMap.end();

    // Initial values taken from the variable's own record.
    uint64_t AnchorOrder = HasSelfInfo ? SelfIt->second.Order : InvalidOrder;
    uint64_t ResultFreq = HasSelfInfo ? SelfIt->second.Count : 0;

    // Collect local references.
    llvm::SmallDenseMap<const Instruction*, uint64_t> References;
    SmallPtrSet<const Value*, 16> Seen;
    findGlobalUsers(GV, References, Seen);

    uint64_t HottestWeight = 0;
    uint64_t AccumulatedFreq = 0;
    uint64_t LowestOrder = std::numeric_limits<uint64_t>::max();
    uint64_t HighestOrder = 0;
    bool SawOrderedFunction = false;

    for (auto& Entry : References) {
      const Instruction* UserInst = Entry.first;
      Function* Owner = const_cast<Function*>(UserInst->getFunction());
      if (!Owner || Owner->isDeclaration()) continue;

      auto& BFI = FAM.getResult<BlockFrequencyAnalysis>(*Owner);
      const uint64_t BlockCount =
          BFI.getBlockProfileCount(UserInst->getParent()).value_or(0);
      const uint64_t Weighted = BlockCount * Entry.second;

      AccumulatedFreq += Weighted;

      const auto OwnerIt = SymbolInfoMap.find(GetSymbolGUID(*Owner));
      if (OwnerIt == SymbolInfoMap.end()) continue;

      const uint64_t OwnerOrder = OwnerIt->second.Order;

      // Track min/max for the spread.
      if (OwnerOrder < LowestOrder) LowestOrder = OwnerOrder;
      if (OwnerOrder > HighestOrder) HighestOrder = OwnerOrder;
      SawOrderedFunction = true;

      // Anchor selection (hottest reference wins; ties keep the earlier one).
      if (Weighted > HottestWeight) {
        HottestWeight = Weighted;
        AnchorOrder = OwnerOrder;
      }
    }

    // Spread of the referencing functions' orders.
    const uint64_t Spread = SawOrderedFunction
                                ? HighestOrder - LowestOrder
                                : std::numeric_limits<uint64_t>::max();

    // Merge frequencies (local data takes precedence).
    if (AccumulatedFreq > 0) {
      ResultFreq = AccumulatedFreq;
    }

    if (TE_ENABLE_AFFINITY_ORDER_LOGGING) {
      if (AnchorOrder != InvalidOrder) {
        if (HasSelfInfo) {
          errs() << "[TeDataLayout calculateAffinityScore] PERFECT: "
                 << GV->getName() << " | Path: " << GV->getParent()->getName()
                 << " | Order: " << AnchorOrder << "\n";
        } else {
          // The variable itself was a miss, but it was rescued through a
          // reverse lookup from a referencing function.
          errs() << "[TeDataLayout calculateAffinityScore] RESCUED: "
                 << GV->getName() << " | Path: " << GV->getParent()->getName()
                 << " | Order: " << AnchorOrder << "\n";
        }
      } else if (HasSelfInfo) {
        // The symbol exists, but no order was assigned to it.
        errs() << "[TeDataLayout calculateAffinityScore] UNORDERED: " << GV->getName()
               << " | Path: " << GV->getParent()->getName()
               << "\n";
      } else {
        // A genuine miss.
        errs() << "[TeDataLayout calculateAffinityScore] MISS!: " << GV->getName()
               << " | Path: " << GV->getParent()->getName()
               << "\n";
      }
    }

    return { AnchorOrder, Spread, ResultFreq };
  }

  PreservedAnalyses run(Module& M, ModuleAnalysisManager& MAM) {
    bool Changed = false;
    const DataLayout& DL = M.getDataLayout();
    ProfileSummaryInfo* PSI = &MAM.getResult<ProfileSummaryAnalysis>(M);
    if (!PSI || !PSI->hasProfileSummary()) return PreservedAnalyses::all();

    auto& FAM =
        MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

    // ------------------------------------------------------------------------
    // Phase 0: 保護対象の収集
    // ------------------------------------------------------------------------
    SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
    auto CollectUsed = [&](StringRef Name) {
      if (GlobalVariable* GV = M.getGlobalVariable(Name)) {
        if (ConstantArray* CA = dyn_cast<ConstantArray>(GV->getInitializer())) {
          for (Use& Op : CA->operands())
            UsedGlobals.insert(cast<GlobalValue>(Op->stripPointerCasts()));
        }
      }
    };
    CollectUsed("llvm.used");
    CollectUsed("llvm.compiler.used");

    auto CalcTotalCount = [&](GlobalVariable& GV) {
      uint64_t TotalCount = 0;
      for (User* U : GV.users()) {
        if (Instruction* I = dyn_cast<Instruction>(U)) {
          if (Function* F = I->getFunction()) {
            if (!F->isDeclaration() && F->hasProfileData()) {
              auto& BFI = FAM.getResult<BlockFrequencyAnalysis>(*F);
              if (auto Count = BFI.getBlockProfileCount(I->getParent()))
                TotalCount += *Count;
            }
          }
        }
      }
      return TotalCount;
    };

    // ------------------------------------------------------------------------
    // Phase 0.5: 専用アノテーションを検出
    // ------------------------------------------------------------------------
    SmallPtrSet<const GlobalValue*, 16> AllowReorderGlobals;
    SmallPtrSet<const GlobalValue*, 16> DisallowReorderGlobals;
    SmallPtrSet<const GlobalValue*, 16> AtomicIsolateGlobals;

    if (GlobalVariable* Annots =
            M.getGlobalVariable("llvm.global.annotations")) {
      if (auto* A = dyn_cast<ConstantArray>(Annots->getInitializer())) {
        for (auto& Op : A->operands()) {
          auto* AS = dyn_cast<ConstantStruct>(Op);
          if (!AS || AS->getNumOperands() < 2) continue;

          auto* GV =
              dyn_cast<GlobalValue>(AS->getOperand(0)->stripPointerCasts());
          auto* AnnotGV =
              dyn_cast<GlobalVariable>(AS->getOperand(1)->stripPointerCasts());
          if (!GV || !AnnotGV || !AnnotGV->hasInitializer()) continue;

          if (auto* AnnotData =
                  dyn_cast<ConstantDataArray>(AnnotGV->getInitializer())) {
            StringRef AnnotStr = AnnotData->getAsCString();
            if (AnnotStr == "tt_allow_reorder")
              AllowReorderGlobals.insert(GV);
            else if (AnnotStr == "tt_disallow_reorder")
              DisallowReorderGlobals.insert(GV);
            else if (AnnotStr == "tt_atomic_isolate")
              AtomicIsolateGlobals.insert(GV);
          }
        }
      }
    }

    // ------------------------------------------------------------------------
    // Phase 0.6: 専用マジックセクションを検出
    // ------------------------------------------------------------------------
    for (GlobalVariable& GV : M.globals()) {
      if (GV.hasSection()) {
        StringRef SecName = GV.getSection();
        if (SecName == ".tt_allow_reorder" ||
            SecName == ".tt_disallow_reorder" ||
            SecName == ".tt_atomic_isolate") {

          if (SecName == ".tt_allow_reorder")
            AllowReorderGlobals.insert(&GV);
          else if (SecName == ".tt_disallow_reorder")
            DisallowReorderGlobals.insert(&GV);
          else if (SecName == ".tt_atomic_isolate")
            AtomicIsolateGlobals.insert(&GV);

          GV.setSection("");
          Changed = true;
        }
      }
    }

    // ------------------------------------------------------------------------
    // 使用状況による安全性判定 (Usage Analysis)
    // ------------------------------------------------------------------------
    std::function<bool(const User*, const Value*,
                       SmallPtrSetImpl<const User*>&)> CheckUsers;

    CheckUsers = [&](const User* U, const Value* CurrentPtr,
                     SmallPtrSetImpl<const User*>& Visited) -> bool {
      if (!Visited.insert(U).second) return true;  // 再帰ループ防止

      if (isa<LoadInst>(U)) return true;  // 読み取りはOK

      if (auto* SI = dyn_cast<StoreInst>(U)) {
        // CurrentPtr への書き込み (*CurrentPtr = val) はOK
        if (SI->getPointerOperand() == CurrentPtr) return true;

        // CurrentPtr 自体が値として保存される (*addr = CurrentPtr) はNG
        // (エスケープ)
        if (SI->getValueOperand() == CurrentPtr) return false;

        // それ以外(無関係なStore)は一応安全とみなす
        return true;
      }

      // ポインタ加工命令: 結果(U)を次の CurrentPtr として再帰的にチェック
      if (auto* GEP = dyn_cast<GetElementPtrInst>(U)) {
        for (const User* GEPUser : GEP->users())
          if (!CheckUsers(GEPUser, GEP, Visited)) return false;
        return true;
      }
      if (auto* BC = dyn_cast<BitCastInst>(U)) {
        for (const User* BCUser : BC->users())
          if (!CheckUsers(BCUser, BC, Visited)) return false;
        return true;
      }
      if (auto* Sel = dyn_cast<SelectInst>(U)) {
        for (const User* SelUser : Sel->users())
          if (!CheckUsers(SelUser, Sel, Visited)) return false;
        return true;
      }
      if (auto* Phi = dyn_cast<PHINode>(U)) {
        for (const User* PhiUser : Phi->users())
          if (!CheckUsers(PhiUser, Phi, Visited)) return false;
        return true;
      }

      // その他の命令（PtrToInt, Callなど）は安全と断定できないためNG
      return false;
    };

    auto IsSafeByUsage = [&](GlobalVariable& GV) {
      // 解析コスト削減のため、定義がないものやThreadLocalは対象外
      if (GV.isDeclaration() || GV.isThreadLocal()) return false;

      SmallPtrSet<const User*, 16> Visited;
      for (const User* U : GV.users()) {
        // 初回呼び出し: CurrentPtr = &GV
        if (!CheckUsers(U, &GV, Visited)) return false;
      }
      return true;
    };

    // 構造的な絶対除外判定
    auto IsStructurallyIgnored = [&](GlobalVariable& GV) {
      // tt_disallow_reorderは拒否
      if (DisallowReorderGlobals.count(&GV)) return true;

      // DLLExportも拒否
      if (GV.hasDLLExportStorageClass()) return true;

      // tt_allow_reorderは許可
      if (AllowReorderGlobals.count(&GV)) return false;

      // tt_atomic_isolateも許可
      if (AtomicIsolateGlobals.count(&GV)) return false;

      // システム的/構造的な除外
      if (GV.hasSection() || GV.hasComdat() || GV.isThreadLocal() ||
          !GV.hasInitializer() || UsedGlobals.count(&GV)) {
        return true;
      }
      return false;
    };

    // リンケージと属性による安全性判定
    auto IsSafeLinkage = [&](GlobalVariable& GV) {
      bool Safe = GV.hasLocalLinkage() ||
                  (GV.isConstant() && GV.hasGlobalUnnamedAddr());

      // 救済ルール: アノテーション
      if (!Safe && AllowReorderGlobals.count(&GV)) Safe = true;
      if (!Safe && AtomicIsolateGlobals.count(&GV)) Safe = true;

      // 救済ルール: 使用状況分析
      if (!Safe && IsSafeByUsage(GV)) Safe = true;

      return Safe;
    };

    // ------------------------------------------------------------------------
    // Phase 1: 全変数の Hotness を事前計算
    // ------------------------------------------------------------------------
    DenseMap<GlobalVariable*, uint64_t> GVHotness;
    SmallVector<GlobalVariable*, 64> AllGVs;

    for (GlobalVariable& GV : M.globals()) {
      if (IsStructurallyIgnored(GV))
        continue;

      // Hotnessを計算
      uint64_t TotalCount = CalcTotalCount(GV);

      if (!IsSafeLinkage(GV)) {
        // レポート用の閾値
        const int Report_Cutoff = Cutoff_UltraHot;

        // ソースコードの修正で保護対象外にできる可能性のあるシンボルを列挙
        if (PSI->isHotCountNthPercentile(Report_Cutoff, TotalCount)) {
          if (TE_ENABLE_MISSED_OPPORTUNITY_LOGGING) {
            if (!GV.isConstant()) {
              errs() << "[TeDataLayout] Missed Opportunity: " << GV.getName()
                     << " (Count: " << TotalCount << ")\n"
                     << "    Reason: !GV.isConstant()\n";
            } else if (!GV.hasGlobalUnnamedAddr()) {
              errs() << "[TeDataLayout] Missed Opportunity: " << GV.getName()
                     << " (Count: " << TotalCount << ")\n"
                     << "    Reason: !GV.hasGlobalUnnamedAddr()\n";
            }
          }
        }
        continue;
      }

      GVHotness[&GV] = TotalCount;
      AllGVs.push_back(&GV);
    }

    // ------------------------------------------------------------------------
    // Phase 2: Hotness-Aware 重複排除 (Merge)
    // ------------------------------------------------------------------------
    std::sort(AllGVs.begin(), AllGVs.end(),
              [&](GlobalVariable* A, GlobalVariable* B) {
                if (GVHotness[A] != GVHotness[B])
                  return GVHotness[A] > GVHotness[B];
                return A->getName() < B->getName();
              });

    DenseMap<Constant*, GlobalVariable*> UniqueConstants;
    SmallVector<GlobalVariable*, 16> ToErase;
    for (GlobalVariable* GV : AllGVs) {
      if (!GV->isConstant() || !GV->hasLocalLinkage())
        continue;

      // 明示的に隔離指定されたものはマージしない
      if (AtomicIsolateGlobals.count(GV))
        continue;

      Constant* Init = GV->getInitializer();
      auto It = UniqueConstants.find(Init);
      if (It == UniqueConstants.end()) {
        UniqueConstants[Init] = GV;
      } else {
        GlobalVariable* HottestGV = It->second;
        if (HottestGV->getValueType() == GV->getValueType()) {
          HottestGV->setAlignment(std::max(HottestGV->getAlign().valueOrOne(),
                                           GV->getAlign().valueOrOne()));
          GV->replaceAllUsesWith(HottestGV);
          ToErase.push_back(GV);
          Changed = true;
        }
      }
    }
    for (GlobalVariable* GV : ToErase) GV->eraseFromParent();

    ToErase.clear();
    UniqueConstants.clear();
    AllGVs.clear();

    // ------------------------------------------------------------------------
    // Phase 3: 移動候補の収集 & Hotness再集計
    // ------------------------------------------------------------------------
    SmallVector<GlobalVariable*, 64> Candidates;
    for (GlobalVariable& GV : M.globals()) {
      if (IsStructurallyIgnored(GV))
        continue;

      if (!IsSafeLinkage(GV))
        continue;

      Type* Ty = GV.getValueType();
      if (!Ty->isSized())
        continue;

      uint64_t PrevCount = GVHotness.count(&GV) ? GVHotness[&GV] : 0;
      uint64_t Size = DL.getTypeAllocSize(Ty);

      // 「一度も使われていない」かつ「サイズが小さい」ものだけを除外
      if (PrevCount == 0 && Size < MinSizeToMoveCold)
        continue;

      // マージ後の参照関係で再集計
      GVHotness[&GV] = CalcTotalCount(GV);  // 最新値に更新
      Candidates.push_back(&GV);
    }

    // ------------------------------------------------------------------------
    // Phase 3.5: 各データのランク確定
    // ------------------------------------------------------------------------
    DenseMap<GlobalVariable*, HotnessRank> GVRank;
    DenseMap<GlobalVariable*, AffinityInfo> GVAffinity;
    DenseMap<GlobalVariable*, bool> IsAtomicIsolated;

    for (GlobalVariable* GV : Candidates) {
      GVHotness[GV] = CalcTotalCount(*GV);

      HotnessRank Rank = HotnessRank::Warm;
      uint64_t ExecCount = GVHotness[GV];

      // 1. 統計的頻度による初期判定
      if (PSI->isHotCountNthPercentile(Cutoff_UltraHot, ExecCount)) {
        Rank = HotnessRank::UltraHot;
      } else if (PSI->isHotCountNthPercentile(Cutoff_Hot, ExecCount)) {
        Rank = HotnessRank::Hot;
      } else if (PSI->isColdCountNthPercentile(Cutoff_Cold, ExecCount)) {
        Rank = HotnessRank::Cold;
      }

      // 2. Hotブロック参照による昇格 (WarmのものだけをHotに引き上げる)
      if (Rank == HotnessRank::Warm) {
        Function* CachedF = nullptr;
        BlockFrequencyInfo* CachedBFI = nullptr;

        for (User* U : GV->users()) {
          if (auto* I = dyn_cast<Instruction>(U)) {
            if (auto* F = I->getFunction()) {
              // 宣言のみの関数やプロファイルがない関数はスキップ（安全性のため）
              if (F->isDeclaration() || !F->hasProfileData())
                continue;

              // 関数が変わった時だけBFIを取得し直す
              if (F != CachedF) {
                CachedF = F;
                CachedBFI = &FAM.getResult<BlockFrequencyAnalysis>(*F);
              }

              if (PSI->isHotBlockNthPercentile(Cutoff_Hot, I->getParent(), CachedBFI)) {
                Rank = HotnessRank::Hot;  // Hotへ昇格
                break;  // 1つでもHotブロックがあれば即終了でOK
              }
            }
          }
        }
      }
      GVRank[GV] = Rank;

      // 親和性スコアの計算
      GVAffinity[GV] = calculateAffinityOrder(GV, FAM);

      // False Sharing防止 (明示的指定時のみ)
      if (AtomicIsolateGlobals.count(GV)) {
        IsAtomicIsolated[GV] = true;
      }
    }

    // ------------------------------------------------------------------------
    // Phase 4: ソートと物理再配置
    // ------------------------------------------------------------------------
    // ランクごとの合計データサイズを計算
    llvm::DenseMap<HotnessRank, uint64_t> RankDataSize;
    for (auto* GV : Candidates) {
      RankDataSize[GVRank[GV]] += DL.getTypeAllocSize(GV->getValueType());
    }

    if (TE_ENABLE_SORT_LOGGING) {
      errs() << "[TeDataLayout Module Sort] --- Sorting Module: "
        << M.getName()
        << " | PID: " << llvm::sys::Process::getProcessId()
        << " ---\n";
    }

    std::sort(Candidates.begin(), Candidates.end(),
              [&](GlobalVariable* A, GlobalVariable* B) -> bool {
                /* TEST2のソート
                if (GVRank[A] != GVRank[B]) return GVRank[A] < GVRank[B];

                Align AlignA = A->getAlign().valueOrOne();
                Align AlignB = B->getAlign().valueOrOne();
                if (AlignA != AlignB) return AlignA > AlignB;

                return GVHotness[A] > GVHotness[B];
                */

                if (TE_ENABLE_SORT_LOGGING) {
                  uint64_t guidA = GetSymbolGUID(*A);
                  auto itA = SymbolInfoMap.find(guidA);
                  if (itA != SymbolInfoMap.end()) {
                    llvm::errs()
                        << "[TeDataLayout Module Sort] Match! GV: "
                        << A->getName() << " | GUID: "
                        << format_hex(guidA, 16)
                        << " | Order: " << itA->second.Order
                        << "\n";

                  } else {
                    llvm::errs()
                        << "[TeDataLayout Module Sort] MISS! GV: "
                        << A->getName()
                        << " | Path: " << A->getParent()->getName()
                        << " | PID: " << llvm::sys::Process::getProcessId()
                        << "\n";
                  }
                }

                // Rank
                if (GVRank[A] != GVRank[B]) return GVRank[A] < GVRank[B];

                HotnessRank CurrentRank = GVRank[A];
                uint64_t CurrentRankSize = RankDataSize[CurrentRank];
                bool IsRankHuge = (CurrentRankSize > 4096);

                const auto& InfoA = GVAffinity[A];
                const auto& InfoB = GVAffinity[B];

                // アラインメント
                Align AlignA = A->getAlign().valueOrOne();
                Align AlignB = B->getAlign().valueOrOne();

                // サイズ
                uint64_t SizeA = DL.getTypeAllocSize(A->getValueType());
                uint64_t SizeB = DL.getTypeAllocSize(B->getValueType());

                // キャッシュライン密度
                bool IsSmallA = (SizeA <= HalfCacheLineSize);
                bool IsSmallB = (SizeB <= HalfCacheLineSize);

                // タイムバケット
                const uint64_t TimeBucketShift = 6; // 64関数単位
                uint64_t TimeA = InfoA.Order >> TimeBucketShift;
                uint64_t TimeB = InfoB.Order >> TimeBucketShift;

                // 頻度バケット (PGOカウンター値の局所性)
                auto GetFreqBucket = [](uint64_t count) -> uint64_t {
                  if (count == 0) return 0;
                  return llvm::Log2_64(count);
                };
                uint64_t FreqA = GetFreqBucket(GVHotness[A]);
                uint64_t FreqB = GetFreqBucket(GVHotness[B]);

                // 関数に対する密着度
                const uint64_t LocalSpreadThreshold = (1ULL << TimeBucketShift) * 2;
                bool IsLocalA = InfoA.Spread < LocalSpreadThreshold;
                bool IsLocalB = InfoB.Spread < LocalSpreadThreshold;

                // UltraHot/Hotかつ巨大なRank(Rustクレート等) → 時間的局所性を優先
                if (CurrentRank <= HotnessRank::Hot && IsRankHuge) {
                  // 時間軸
                  if (TimeA != TimeB) return TimeA < TimeB;

                  // キャッシュライン密度
                  if (IsSmallA != IsSmallB) return IsSmallA > IsSmallB;

                  // アライメント
                  if (AlignA != AlignB) return AlignA > AlignB;

                  // 頻度
                  if (FreqA != FreqB) return FreqA > FreqB;

                  // 関数に対する密着度
                  if (IsLocalA != IsLocalB) return IsLocalA > IsLocalB;
                }
                // Coldまたは小規模なRank → 密度優先
                else {
                  // アライメント
                  if (AlignA != AlignB) return AlignA > AlignB;

                  // 頻度
                  if (FreqA != FreqB) return FreqA > FreqB;

                  // 時間軸
                  if (TimeA != TimeB) return TimeA < TimeB;
                }

                // サイズ (昇順: 小さいものを詰めてキャッシュライン密度を上げる)
                if (SizeA != SizeB)
                  return SizeA < SizeB;

                // 頻度 (生)
                if (GVHotness[A] != GVHotness[B])
                  return GVHotness[A] > GVHotness[B];

                // 関数に対する密着度 (生)
                if (InfoA.Spread != InfoB.Spread)
                  return InfoA.Spread < InfoB.Spread;

                // CDSort順 (生)
                if (InfoA.Order != InfoB.Order)
                  return InfoA.Order < InfoB.Order;

                // タイブレーカー
                return A->getName() < B->getName();
              });

    for (GlobalVariable* GV : Candidates) {
      GV->removeFromParent();
      M.insertGlobalVariable(GV);
    }

    // ------------------------------------------------------------------------
    // Phase 5: セクション割り当て
    // ------------------------------------------------------------------------
    for (GlobalVariable* GV : Candidates) {
      HotnessRank Rank = GVRank[GV];
      bool IsAtomic = IsAtomicIsolated.lookup(GV);
      bool IsConstant = GV->isConstant();

      // アトミック変数のアラインメント調整
      if (IsAtomic && !IsConstant) {
        if (GV->getAlign().valueOrOne().value() < CacheLineSize) {
          GV->setAlignment(Align(CacheLineSize));
        }
      }

      // セクション名
      SmallString<32> SectionName;
      raw_svector_ostream OS(SectionName);
      bool needSubsection = true;

      if (IsAtomic && !IsConstant) {
        OS << ".at_data";  // アトミック変数
        needSubsection = false;
      } else if (IsConstant) {
        OS << ".rdata";
      } else {
        uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
        bool IsBSS = !IsConstant && GV->getInitializer()->isNullValue();
        bool ShouldPromote = (!IsBSS) || (Rank == HotnessRank::UltraHot && Size <= 64);

        if (ShouldPromote) {
          OS << ".data";
        } else {
          OS << "";  // BSS
          needSubsection = false;
        }
      }

      if (needSubsection) {
        OS << subsectionPrefix[(int)Rank] << (int)Rank;
      }

      GV->setSection(OS.str());

      if (IsConstant && Rank <= HotnessRank::Hot) {
        GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      }
      Changed = true;
    }

    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};

// ============================================================================
// TeInlinerLTOAnalysis: LLVMのLTO::runThinLTO()から呼び出す
// ============================================================================
/// Derives a global ordering for functions and the data they reference from
/// the combined ThinLTO summary index and publishes it in `SymbolInfoMap`.
///
/// Steps:
///   1. Accumulate a per-function node weight from every call edge
///      (hotness class scaled by the relative block frequency).
///   2. Register every GUID that owns a function summary as a graph node and
///      connect the nodes with weighted call edges.
///   3. Run CDSort over that graph.
///   4. Store `{Order, Count}` for every function, for every GUID referenced
///      by some summary, and for every alias whose aliasee was stored.
///
/// `SymbolInfoMap` is always cleared first; it stays empty when the index
/// contains no function summaries.
///
/// \param CombinedIndex  Combined summary index to analyse.
/// \param Conf           LTO configuration; not used.
/// \param Ctx            LLVM context; not used.
/// \param ModuleMap      Modules of the link; not used (see the note below).
extern "C" __declspec(dllexport)
void TeInlinerLTOAnalysis(ModuleSummaryIndex& CombinedIndex, const lto::Config& Conf,
                          LLVMContext& Ctx, MapVector<StringRef, BitcodeModule>& ModuleMap) {
  if (TE_ENABLE_SORT_LOGGING) {
    llvm::errs() << "[TeDataLayout] TeInlinerLTOAnalysis - PID: "
                 << llvm::sys::Process::getProcessId() << "\n";
  }

  // NOTE: module-level ordering is disabled. Nothing consumes a module order
  // (ModuleOrderMap is not populated), so no module-level CDSort is computed
  // and the module list is not inspected. To re-enable it, aggregate
  // instCount / node weight / cross-module call weight per module path, run
  // computeCacheDirectedLayout on those vectors and store the resulting rank
  // per module GUID.
  (void)Conf;
  (void)Ctx;
  (void)ModuleMap;

  SymbolInfoMap.clear();

  std::vector<uint64_t> FuncSizes;
  std::vector<uint64_t> FuncCounts;
  std::vector<codelayout::EdgeCount> CallCounts;
  std::vector<uint64_t> CallOffsets;
  std::vector<GlobalValue::GUID> IdxToGUID;
  DenseMap<GlobalValue::GUID, uint64_t> GUIDToId;
  DenseMap<GlobalValue::GUID, uint64_t> GUIDToNodeWeight;

  // Base weight of a call edge by its static hotness class.
  auto GetWeight = [](CalleeInfo::HotnessType Hotness) -> uint64_t {
    switch (Hotness) {
      case CalleeInfo::HotnessType::Critical:
        return 10000;
      case CalleeInfo::HotnessType::Hot:
        return 1000;
      case CalleeInfo::HotnessType::None:
        return 100;
      case CalleeInfo::HotnessType::Cold:
        return 1;
      default:
        return 0;
    }
  };

  // Weight of one call edge: the base weight scaled by the relative block
  // frequency. An edge with a non-zero base weight never rounds down to 0;
  // an edge with a zero base weight yields 0.
  auto GetScaledWeight = [&](const CalleeInfo& Info) -> uint64_t {
    const uint64_t BaseWeight = GetWeight(Info.getHotness());
    if (BaseWeight == 0) return 0;
    const uint64_t Scaled =
        (BaseWeight * static_cast<uint64_t>(Info.RelBlockFreq)) >>
        CalleeInfo::ScaleShift;
    return std::max<uint64_t>(1, Scaled);
  };

  // Follows alias summaries until a GUID with a non-alias definition is
  // reached and returns that GUID. Returns the last GUID visited when the
  // chain cannot be followed further, or 0 for a null ValueInfo.
  auto GetEntityGUID = [&](ValueInfo VI) -> uint64_t {
    // GUIDs are plain integers, so they are tracked in a small vector rather
    // than a pointer set (which reserves some values as internal markers).
    SmallVector<GlobalValue::GUID, 8> Visited;

    while (VI) {
      auto Summaries = VI.getSummaryList();
      if (Summaries.empty()) break;

      // Scan all summaries: any non-alias summary makes this GUID the entity.
      const AliasSummary* AS = nullptr;
      bool HasBaseDefinition = false;
      for (auto& S : Summaries) {
        if (auto* Alias = dyn_cast<AliasSummary>(S.get())) {
          AS = Alias;
        } else {
          HasBaseDefinition = true;
          break;
        }
      }
      if (HasBaseDefinition) return VI.getGUID();

      // Only alias summaries were found: continue with the aliasee, provided
      // one is actually attached (getAliaseeGUID() requires it).
      if (!AS || !AS->hasAliasee()) break;

      const uint64_t AliaseeGUID = AS->getAliaseeGUID();
      if (llvm::is_contained(Visited, AliaseeGUID)) break;  // cycle
      Visited.push_back(AliaseeGUID);

      VI = CombinedIndex.getValueInfo(AliaseeGUID);
    }
    return VI ? VI.getGUID() : 0;
  };

  // 1. Node weights: sum the incoming edge weights on the resolved callee.
  for (auto& Entry : CombinedIndex) {
    for (auto& S : Entry.second.SummaryList) {
      auto* FS = dyn_cast<FunctionSummary>(S.get());
      if (!FS) continue;

      for (auto& Call : FS->calls()) {
        const uint64_t CalleeGUID = GetEntityGUID(Call.first);
        if (CalleeGUID)
          GUIDToNodeWeight[CalleeGUID] += GetScaledWeight(Call.second);
      }
    }
  }

  // 2a. Nodes: one per GUID owning a function summary. The summary with the
  //     largest instruction count supplies the node size.
  for (auto& Entry : CombinedIndex) {
    FunctionSummary* BestFS = nullptr;
    for (auto& S : Entry.second.SummaryList) {
      if (auto* FS = dyn_cast<FunctionSummary>(S.get())) {
        if (!BestFS || FS->instCount() > BestFS->instCount()) BestFS = FS;
      }
    }
    if (!BestFS || GUIDToId.count(Entry.first)) continue;

    GUIDToId[Entry.first] = IdxToGUID.size();
    IdxToGUID.push_back(Entry.first);
    FuncSizes.push_back(static_cast<uint64_t>(BestFS->instCount()));
    // lookup() rather than operator[]: reading must not insert map entries.
    FuncCounts.push_back(
        std::max<uint64_t>(1, GUIDToNodeWeight.lookup(Entry.first)));
  }

  // 2b. Edges: caller -> resolved callee. The call offset is approximated by
  //     spreading the call sites evenly over the caller's instruction count.
  for (auto& Entry : CombinedIndex) {
    auto ItSrc = GUIDToId.find(Entry.first);
    if (ItSrc == GUIDToId.end()) continue;
    const uint64_t SrcId = ItSrc->second;

    for (auto& S : Entry.second.SummaryList) {
      auto* FS = dyn_cast<FunctionSummary>(S.get());
      if (!FS) continue;

      const uint64_t NumCalls = FS->calls().size();
      const uint64_t OffsetStep =
          (NumCalls > 0) ? std::max<uint64_t>(1, FS->instCount() / NumCalls) : 0;

      uint64_t EdgeIdx = 0;
      for (auto& Call : FS->calls()) {
        const uint64_t CalleeGUID = GetEntityGUID(Call.first);
        if (CalleeGUID) {
          auto ItDst = GUIDToId.find(CalleeGUID);
          if (ItDst != GUIDToId.end()) {
            const uint64_t EdgeWeight = GetScaledWeight(Call.second);
            if (EdgeWeight > 0) {
              CallCounts.push_back({ SrcId, ItDst->second, EdgeWeight });
              CallOffsets.push_back(EdgeIdx * OffsetStep);
            }
          }
        }
        // Advances for every call site, including those that add no edge.
        EdgeIdx++;
      }
    }
  }

  if (FuncSizes.empty()) return;

  // 3. CDSort over the function graph.
  auto Order = codelayout::computeCacheDirectedLayout(FuncSizes, FuncCounts, CallCounts, CallOffsets);

  DenseMap<GlobalValue::GUID, uint64_t> GlobalFunctionOrder;
  for (uint64_t i = 0; i < Order.size(); ++i) {
    GlobalFunctionOrder[IdxToGUID[Order[i]]] = i;
  }

  // Per referenced GUID: the summed weight of all referrers, plus the layout
  // position of the heaviest referrer that has a position.
  const uint64_t InvalidOrder = std::numeric_limits<uint64_t>::max();
  struct GVMetadata {
    uint64_t TotalCount = 0;
    uint64_t BestOrder = InvalidOrder;
    uint64_t MaxFCount = 0;
  };
  DenseMap<GlobalValue::GUID, GVMetadata> GlobalGVData;

  for (auto& Entry : CombinedIndex) {
    const uint64_t OwnerGUID = Entry.first;
    const uint64_t OwnerCount = GUIDToNodeWeight.lookup(OwnerGUID);

    // Depends only on the owning GUID, so resolve it once per entry.
    const auto ItOrder = GlobalFunctionOrder.find(OwnerGUID);
    const uint64_t OwnerOrder =
        (ItOrder != GlobalFunctionOrder.end()) ? ItOrder->second : InvalidOrder;

    for (auto& S : Entry.second.SummaryList) {
      const GlobalValueSummary* GVS = S.get();
      // Every reference counts (function -> data as well as data -> data).
      for (const auto& Ref : GVS->refs()) {
        auto& Data = GlobalGVData[Ref.getGUID()];
        Data.TotalCount += OwnerCount;
        if (OwnerCount > Data.MaxFCount && OwnerOrder != InvalidOrder) {
          Data.MaxFCount = OwnerCount;
          Data.BestOrder = OwnerOrder;
        }
      }
    }
  }

  // 4a. Functions: position in the computed layout and node weight.
  for (uint64_t i = 0; i < Order.size(); ++i) {
    uint64_t FuncGUID = IdxToGUID[Order[i]];
    ValueInfo VI = CombinedIndex.getValueInfo(FuncGUID);
    if (!VI || VI.getSummaryList().empty())
      continue;

    uint64_t Count = GUIDToNodeWeight.lookup(FuncGUID);
    SymbolInfoMap[FuncGUID] = { i, Count };
  }

  // 4b. Remaining referenced GUIDs (those not registered as function nodes).
  for (auto& Entry : GlobalGVData) {
    uint64_t DataGUID = Entry.first;
    if (GUIDToId.count(DataGUID))
      continue;

    ValueInfo VI = CombinedIndex.getValueInfo(DataGUID);
    if (!VI || VI.getSummaryList().empty())
      continue;

    SymbolInfoMap[DataGUID] = { Entry.second.BestOrder, Entry.second.TotalCount };
  }

  // 4c. Aliases: give each alias GUID a copy of its aliasee's record.
  for (auto& Entry : CombinedIndex) {
    for (auto& S : Entry.second.SummaryList) {
      auto* AS = dyn_cast<AliasSummary>(S.get());
      if (!AS || !AS->hasAliasee()) continue;

      const uint64_t AliasGUID = Entry.first;
      const uint64_t BaseGUID = AS->getAliaseeGUID();

      auto itBase = SymbolInfoMap.find(BaseGUID);
      if (itBase != SymbolInfoMap.end()) {
        SymbolInfoMap[AliasGUID] = itBase->second;
      }
    }
  }
}

// ============================================================================
// TeInlinerSectionChunkSort: LLVMのWriter::createSections()から呼び出す
// ============================================================================
/// Per-section chunk ordering hook.
///
/// Chunk reordering is currently switched off: the function only emits an
/// optional trace line and returns false without reading or modifying
/// `chunks`. The previous implementation is kept below as a comment.
///
/// \param SectionName  Section being processed; used for the trace line only.
/// \param chunks       Chunks of that section; left untouched.
/// \return Always false (presumably "order unchanged" - confirm with caller).
extern "C" __declspec(dllexport)
bool TeInlinerSectionChunkSort(StringRef SectionName, std::vector<TeChunkInfo>& chunks) {
  if (TE_ENABLE_SORT_LOGGING) {
    raw_ostream& Log = errs();
    Log << "[TeDataLayout Chunk Sort] --- Sorting Section: " << SectionName;
    Log << " | PID: " << llvm::sys::Process::getProcessId();
    Log << " | SymbolInfoMap Size: " << SymbolInfoMap.size();
    Log << " ---\n";
  }

  const bool ChunksReordered = false;
  return ChunksReordered;

  /*
  // ---- Disabled: debug dump of every named chunk ---------------------------
  errs() << "[TeDataLayout Chunk Sort] --- Sorting Section: " << SectionName
    << " | PID: " << llvm::sys::Process::getProcessId()
    << " | SymbolInfoMap Size: " << SymbolInfoMap.size()
    << " ---\n";

  for (auto& chunk : chunks) {
    if (chunk.Name.empty())
      continue;

    errs() << "chunk "
           << "Name: " << chunk.Name.str()
           << " | Path" << chunk.Path.str()
           << "\n";
  }

  // ---- Disabled: chunk sort implementation ---------------------------------
  // NOTE(review): written for a void-returning signature; the bare `return;`
  // below must return a bool before this can be re-enabled.
  if (SymbolInfoMap.empty()) return;

  struct DirStats {
    uint64_t totalSize = 0;
    uint64_t minOrder = std::numeric_limits<uint64_t>::max();
  };

  // Cached per-chunk keys so the comparator does no recomputation.
  struct SortEntry {
    TeChunkInfo* Info;
    uint64_t SymbolGUID;
    uint64_t Order;
    uint64_t Count;
    StringRef DirName;
  };

  llvm::StringMap<DirStats> dirMap;
  std::vector<SortEntry> sortList;
  sortList.reserve(chunks.size());

  // Pre-computation pass.
  for (auto& info : chunks) {
    if (info.SymbolName.empty())
      continue;

    uint64_t SymbolGUID = GetGVGUID(info.Path, info.SymbolName);
    std::string normPath = GetNormalizedModulePath(info.Path);
    StringRef dir = sys::path::parent_path(normPath);
    auto& stats = dirMap[dir];

    auto it = SymbolInfoMap.find(SymbolGUID);

    if (TE_ENABLE_SORT_LOGGING) {
      if (it != SymbolInfoMap.end()) {
        llvm::errs() << "[TeDataLayout Chunk Sort] Match! GV: "
                     << info.SymbolName << " | GUID: " << format_hex(SymbolGUID, 16)
                     << " | Order: " << it->second.Order << "\n";
      } else {
        llvm::errs() << "[TeDataLayout Chunk Sort] MISS! GV: "
                     << info.SymbolName
                     << " | Path: " << info.Path
                     << " | PID: " << llvm::sys::Process::getProcessId()
                     << "\n";
      }
    }

    uint64_t order = (it != SymbolInfoMap.end()) ? it->second.Order : std::numeric_limits<uint64_t>::max();
    uint64_t count = (it != SymbolInfoMap.end()) ? it->second.Count : 0;

    stats.totalSize += info.Size;
    if (order < stats.minOrder) stats.minOrder = order;

    sortList.push_back({&info, SymbolGUID, order, count, dirMap.find(dir)->first()});
  }

  // Sort pass.
  std::stable_sort(sortList.begin(), sortList.end(), [&](const SortEntry& a, const SortEntry& b) {
        // Order whole directories first.
        if (a.DirName != b.DirName) {
          return dirMap.find(a.DirName)->second.minOrder < dirMap.find(b.DirName)->second.minOrder;
        }

        // From here on both chunks belong to the same directory.
        bool isDirHuge = (dirMap.find(a.DirName)->second.totalSize > 4096);
        if (isDirHuge) {
          // Huge directory: time axis first.
          uint64_t bucketA = a.Order / 64;
          uint64_t bucketB = b.Order / 64;
          if (bucketA != bucketB) return bucketA < bucketB;

          // Size split (chunks of 32 bytes or less first).
          if ((a.Info->Size <= 32) != (b.Info->Size <= 32))
            return a.Info->Size <= 32;

          if (a.Info->Alignment != b.Info->Alignment)
            return a.Info->Alignment > b.Info->Alignment;
        } else {
          // Small directory: density first.
          if (a.Info->Alignment != b.Info->Alignment)
            return a.Info->Alignment > b.Info->Alignment;
        }

        // Shared tie-breakers.
        if (a.Order != b.Order) return a.Order < b.Order;
        if (a.Count != b.Count) return a.Count > b.Count;
        return a.Info->SymbolName < b.Info->SymbolName;
      });

  // Write back.
  std::vector<TeChunkInfo> sortedChunks;
  sortedChunks.reserve(chunks.size());
  for (auto& entry : sortList) {
    sortedChunks.push_back(std::move(*entry.Info));
  }
  chunks = std::move(sortedChunks);
  */
}

}  // namespace


   // Plugin registration
/// Pass-plugin entry point: returns the plugin descriptor whose registration
/// callback hooks the TeInliner passes into the optimizer pipeline.
///
/// Two extension points are used; both do nothing for ThinLTO/FullLTO
/// pre-link phases:
///   - OptimizerEarly: TeInlinerPass followed by a local SROA + EarlyCSE cleanup.
///   - OptimizerLast:  hot/cold splitting, function merging, TeColdOptPass,
///                     a function-level cleanup pipeline and dead-argument
///                     elimination.
extern "C" LLVM_ATTRIBUTE_WEAK ::llvm::PassPluginLibraryInfo
llvmGetPassPluginInfo() {
  return {
    LLVM_PLUGIN_API_VERSION, "TeInliner", LLVM_VERSION_STRING,
    [](PassBuilder& PB) {
      auto IsLTOPreLink = [](ThinOrFullLTOPhase Phase) {
        return Phase == ThinOrFullLTOPhase::ThinLTOPreLink ||
               Phase == ThinOrFullLTOPhase::FullLTOPreLink;
      };

      // The callbacks below are stored by the PassBuilder and run after this
      // registration lambda has returned, so they must not hold references to
      // its locals: IsLTOPreLink is captured by value.

      // 1. Main Inliner
      PB.registerOptimizerEarlyEPCallback([IsLTOPreLink](ModulePassManager& MPM,
                                                         OptimizationLevel,
                                                         ThinOrFullLTOPhase Phase) {
        if (IsLTOPreLink(Phase)) return;

        // Inlining.
        MPM.addPass(createModuleToFunctionPassAdaptor(TeInlinerPass()));

        // Promote the allocas introduced by inlining to registers with SROA so
        // the rest of the LLVM pipeline (LoopVectorize etc.) sees code that is
        // easy to optimize.
        FunctionPassManager LocalCleanup;
        LocalCleanup.addPass(SROAPass(SROAOptions::ModifyCFG));
        LocalCleanup.addPass(EarlyCSEPass(true));
        MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LocalCleanup)));
      });

      // 2. Final Cleanup
      PB.registerOptimizerLastEPCallback([IsLTOPreLink](ModulePassManager& MPM,
                                                        OptimizationLevel,
                                                        ThinOrFullLTOPhase Phase) {
        if (IsLTOPreLink(Phase)) return;

        // Hot/Cold Splitting
        MPM.addPass(HotColdSplittingPass());

        MPM.addPass(MergeFunctionsPass());

        // Size optimization.
        MPM.addPass(TeColdOptPass());

        // Data optimization (moving data between sections) - disabled.
        // MPM.addPass(TeDataLayoutPass());

        FunctionPassManager CleanupFPM;
        CleanupFPM.addPass(SCCPPass());                       // constant propagation
        CleanupFPM.addPass(SROAPass(SROAOptions::ModifyCFG)); // memory optimization
        CleanupFPM.addPass(AggressiveInstCombinePass());      // aggressive combining
        CleanupFPM.addPass(InstCombinePass());
        CleanupFPM.addPass(SimplifyCFGPass());
        CleanupFPM.addPass(ADCEPass());                       // dead code removal
        MPM.addPass(createModuleToFunctionPassAdaptor(std::move(CleanupFPM)));

        // Module-level leftovers.
        MPM.addPass(DeadArgumentEliminationPass());
      });
    }
  };
}
