Moved blog to dedicated page

This commit is contained in:
Yan Lin 2025-06-01 20:57:18 +02:00
parent 7b37f017ba
commit 0f4437d78c
36 changed files with 397 additions and 1631 deletions


@@ -5,6 +5,7 @@ primaryPublications:
- "IEEE TKDE"
- "2025"
links:
Paper: "https://ieeexplore.ieee.org/document/11004614"
Preprint: "https://arxiv.org/abs/2402.07232"
Code: "https://github.com/Logan-Lin/UVTM"
@@ -115,7 +116,7 @@ secondaryPublications:
authors: "Letian Gong, Shengnan Guo, <strong>Yan Lin</strong>, Yichen Liu, Erwen Zheng, Yiwei Shuang, Youfang Lin, Jilin Hu, Huaiyu Wan"
tags:
- "IEEE TKDE"
- "2024"
- "2024"
links:
Paper: "https://ieeexplore.ieee.org/document/10836764"
@@ -181,13 +182,13 @@ secondaryPublications:
Code: "https://github.com/Water2sea/WITRAN"
primaryProjects:
- title: 'Research on <i>Prediction of User Travel Destination and Travel Time Based on Trajectory Representation Learning</i>'
- title: "Research on <i>Prediction of User Travel Destination and Travel Time Based on Trajectory Representation Learning</i>"
tags:
- "Fundamental Research Funds for the Central Universities of China"
desc: "Applies representation learning to trajectory data to transform original features into high-level information, improving the performance of downstream tasks such as travel time and destination prediction."
links: {}
- title: 'Development of <i>OverleafCopilot - Empowering Academic Writing in Overleaf with Large Language Models</i>'
- title: "Development of <i>OverleafCopilot - Empowering Academic Writing in Overleaf with Large Language Models</i>"
tags:
- "Personal Interest Project"
desc: "This project aims to develop a Browser extension to seamlessly integrate Large Language Models (such as ChatGPT) into the popular online academic writing platform, Overleaf."
@@ -195,7 +196,7 @@ primaryProjects:
Home: "https://www.overleafcopilot.com/"
Install: "https://chromewebstore.google.com/detail/overleaf-copilot/eoadabdpninlhkkbhngoddfjianhlghb"
- title: 'Development of <i>PromptGenius - All-purpose prompts for LLMs</i>'
- title: "Development of <i>PromptGenius - All-purpose prompts for LLMs</i>"
tags:
- "Personal Interest Project"
desc: "This project focuses on developing a website that offers a wide range of prompt categories, enhancing the versatility of LLMs for various tasks and improving their output quality."
@@ -204,33 +205,33 @@ primaryProjects:
Code: "https://github.com/wenhaomin/ChatGPT-PromptGenius"
secondaryProjects:
- title: 'Research on <i>Inverse Design of Materials Using Diffusion Probabilistic Models</i>'
- title: "Research on <i>Inverse Design of Materials Using Diffusion Probabilistic Models</i>"
tags:
- "Villum Foundation"
desc: "This project focuses on developing diffusion probabilistic models to first understand the relationship between chemistry/structure and material properties, then enable the inverse design of new materials with specific properties. This project currently supports my postdoctoral research position."
links: {}
- title: 'Research on <i>Pre-training Representation Learning Methods of Spatial-temporal Trajectory Data for Traffic Prediction</i>'
- title: "Research on <i>Pre-training Representation Learning Methods of Spatial-temporal Trajectory Data for Traffic Prediction</i>"
tags:
- "National Natural Science Foundation of China"
desc: "This project aims to propose pre-training representation learning methods for spatial-temporal trajectory data, modeling multiple features to improve traffic prediction tasks. It demonstrates how trajectory representation learning can enhance traffic data mining."
links: {}
- title: 'Research on <i>Spatial-temporal Trajectory Generation and Representation Learning Methods for Sparsity Problems</i>'
- title: "Research on <i>Spatial-temporal Trajectory Generation and Representation Learning Methods for Sparsity Problems</i>"
tags:
- "National Natural Science Foundation of China"
desc: "This project explores how to generate high-quality spatial-temporal trajectory data and corresponding representations to address sparsity-related issues, thereby supporting a variety of downstream tasks."
links: {}
presentations:
- title: 'Self-supervised Learning of Trajectory Data'
- title: "Self-supervised Learning of Trajectory Data"
tags:
- "Guest lecture"
- "Aalborg University"
links:
Slides: "/assets/Self-supervised Learning of Trajectory Data.pdf"
- title: 'PLM4Traj: Leveraging Pre-trained Language Models for Cognizing Movement Patterns and Travel Purposes from Trajectories'
- title: "PLM4Traj: Leveraging Pre-trained Language Models for Cognizing Movement Patterns and Travel Purposes from Trajectories"
tags:
- "Workshop presentation"
- "KDD 2024"
@@ -238,21 +239,21 @@ presentations:
Slides: "/assets/KDD_2024_Workshop_PLM4Traj.pdf"
Paper: "https://arxiv.org/abs/2405.12459"
- title: 'Origin-Destination Travel Time Oracle for Map-based Services'
- title: "Origin-Destination Travel Time Oracle for Map-based Services"
tags:
- "Paper Oral"
- "SIGMOD 2024"
links:
Slides: "/assets/SIGMOD-Oral-PPT.pdf"
- title: 'Self-supervised Learning of Spatial-temporal Trajectories'
- title: "Self-supervised Learning of Spatial-temporal Trajectories"
tags:
- "Tutorial"
- "SpatialDI 2024"
links:
Slides: "/assets/Talk on SpatialDI 2024.pdf"
- title: 'Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction'
- title: "Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction"
tags:
- "Paper Oral"
- "AAAI 2021"
@@ -264,14 +265,3 @@ services:
- "Secretary of IEEE (Denmark Section) Computer Society"
- "Reviewer for journals including TIST, TII, and TVT"
- "Member of program committees of KDD, ICLR, NeurIPS, AAAI, CVPR, ICCV, IJCAI, and WWW"
blogs:
- title: "One Step Diffusion Models"
badge: "May 2025"
path: "one-step-diffusion-models"
tldr: "Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps."
- title: "Multi-modal and Multi-function Transformers"
badge: "April 2025"
path: "multi-modal-transformer"
tldr: "Multi-modal and multi-function Transformers enables a single architecture to process diverse data types such as language, images, and videos simultaneously. These models employ techniques like vector quantization and lookup-free quantization to map different modalities into a unified embedding space, allowing the Transformer to handle them within the same sequence. Beyond processing multiple data types, these architectures can also combine different functionalities-such as auto-regressive language generation and diffusion-based image creation-within a single model."


@@ -1,245 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Yan Lin's Blog - Multi-modal and Multi-function Transformers</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<script>
MathJax = {
tex: {
inlineMath: [['$', '$'], ['\\(', '\\)']],
displayMath: [['$$', '$$'], ['\\[', '\\]']]
},
options: {
skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code'],
processHtmlClass: 'arithmatex'
}
};
window.addEventListener('load', function() {
document.querySelectorAll('script[type^="math/tex"]').forEach(function(script) {
const isDisplay = script.type.includes('mode=display');
const math = script.textContent;
const span = document.createElement('span');
span.className = isDisplay ? 'mathjax-block' : 'mathjax-inline';
span.innerHTML = isDisplay ? `\\[${math}\\]` : `\\(${math}\\)`;
script.parentNode.replaceChild(span, script);
});
if (typeof MathJax !== 'undefined' && MathJax.typesetPromise) {
MathJax.typesetPromise();
}
});
</script>
<style>
a {
font-family: 'Lato', sans-serif;
}
img, .figure {
max-width: min(100%, 800px);
height: auto;
display: block;
margin-left: auto;
margin-right: auto;
}
.blog-title {
font-size: calc(1.35rem + 0.9vw);
font-weight: bold;
}
h1 {
font-size: calc(1.35rem + 0.6vw);
margin-top: 2rem;
}
h2 {
font-size: calc(1.1rem + 0.4vw);
margin-top: 1.5rem;
}
h3 {
font-size: calc(0.95rem + 0.1vw);
font-weight: bold;
margin-top: 1rem;
}
</style>
</head>
<body>
<div class="container">
<header class="border-bottom lh-1 py-3 border-secondary">
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
<a class="link-secondary header-icon px-2 h4" href="/"><i class="bi bi-house-fill"></i></a>
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">Yan Lin's Blog</div>
</div>
<div class="col-2 text-end">
<a class="link-secondary header-icon px-2 h4" href="/blog"><i class="bi bi-list-task"></i></a>
</div>
</div>
</header>
</div>
<main class="container">
<article class="section col-xl-10 col-xxl-9 mx-auto">
<p class="blog-title">Multi-modal and Multi-function Transformers</p>
<p><p>Transformers have gained immense popularity within deep learning and AI communities in recent years. Since their introduction in <em>Vaswani et al., "Attention Is All You Need"</em>, they have proven to be powerful sequential models across diverse domains, with thousands of variations and "improved versions." The rise of Large Language Models (LLMs), which largely use Transformers as their foundation, has led to another surge in research around this architecture. This trend has even led graph learning and Computer Vision (CV) communities to move beyond their established foundation models (i.e., GNNs and CNNs) and embrace Transformers. This explains the increasing prevalence of graph Transformers and image Transformers today.</p>
<blockquote>
<p>Han et al., “A Survey on Vision Transformer”; Khan et al., “Transformers in Vision”; Yun et al., “Graph Transformer Networks.”</p>
</blockquote>
<p>Beyond "chasing the trend," using Transformer as a unified foundation model offers several advantages:</p>
<ul>
<li>Transformers excel at capturing long-term dependencies. Unlike GNNs and CNNs which require deeper network structures for longer context, Transformers natively support global dependency modeling through their self-attention mechanism. They also avoid global smoothing and vanishing gradient problems that hinder context length scaling in other network architectures.</li>
<li>Transformers process sequences in parallel rather than sequentially, enabling full utilization of GPU acceleration. This advantage can be further enhanced with techniques like those described in <em>Dao et al., "FlashAttention."</em></li>
<li>Transformers are flexible network structures. They don't inherently enforce sequentiality—without positional encoding, the ordering of input steps to Transformers is equivalent. Through strategic permutation and positional encoding, Transformers can adapt to a wide range of structured and unstructured data.</li>
<li>The development of LLMs has made many open-weight Transformer models available with strong natural language understanding capabilities. These Transformers can be prompted and fine-tuned to model other modalities such as spatiotemporal data and images while retaining their language modeling abilities, creating opportunities for developing multi-modal foundation models.</li>
<li>From a practical perspective, using Transformer as a foundation allows reuse of technical infrastructure and optimizations developed over years, including efficient architecture designs, training pipelines, and specialized hardware.</li>
</ul>
<p>In this article, we will briefly explore techniques for unifying multiple modalities (e.g., natural language and images) and multiple functionalities (e.g., language models and diffusion denoisers) within a single Transformer. These techniques are largely sourced from recent oral papers presented at ICML, ICLR, and CVPR conferences. I assume readers have general knowledge of basic concepts in ML and neural networks, Transformers, LLMs, and diffusion models.</p>
<p>Since images and language modalities represent continuous and discrete data respectively, we will use them as examples throughout this article. Keep in mind that the techniques introduced can be readily extended to other modalities, including spatiotemporal data.</p>
<h1>General Goal</h1>
<p>The goal of a multi-modal Transformer is to create a model that can accept multi-modal inputs and produce multi-modal outputs. For example, instead of using a CNN-based image encoder and a Transformer-based language encoder to map image and language modalities to the latent space separately, a multi-modal Transformer would be able to process the combination of image and language (sentence) as a single sequence.</p>
<figure class="figure">
<img alt="image" src="/blog/md/multi-modal-transformer.assets/image.png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">An example of “conventional” multi-modal fusion. Different modality is processed by separate models and fused at some point. Source: <em>Xiang, Hao, Runsheng Xu, and Jiaqi Ma. "HM-ViT: Hetero-modal vehicle-to-vehicle cooperative perception with vision transformer." CVPR, 2023.</em></figcaption>
</figure>
<figure class="figure">
<img alt="image (1)" src="/blog/md/multi-modal-transformer.assets/image (1).png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">An example of a Transformer that can handle multi-modal inputs and outputs. Different modalities are all projected into tokens and subsequently processed by a unified Transformer encoder. Source: <em>Kondratyuk, Dan, Lijun Yu, et al. “VideoPoet: A Large Language Model for Zero-Shot Video Generation,” ICML, 2024.</em></figcaption>
</figure>
<p>Beyond multi-modal processing, a multi-function Transformer can, for example, function as both a language model (auto-regressive generation) and diffusion denoiser (score-matching generation) simultaneously, supporting two of the most common generation schemes used today.</p>
<h1>Modality Embedding</h1>
<p>A fundamental challenge in unifying multiple modalities within a single Transformer is how to represent different modalities in the same embedding space. For the "QKV" self-attention mechanism to work properly, each item in the input sequence must be represented by an embedding vector of the same dimension, matching the "model dimension" of the Transformer.</p>
<figure class="figure">
<img alt="image (2)" src="/blog/md/multi-modal-transformer.assets/image (2).png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Illustration of the QKV self-attention mechanism in Transformer. <a href="https://en.wikipedia.org/wiki/Attention_(machine_learning)" class="link" target="_blank">Source</a></figcaption>
</figure>
<p>The most common method for mapping language into the embedding space is through tokenization and token embedding. A tokenizer maps a word or word fragment into a discrete token index, and an index-fetching embedding layer (implemented in frameworks like PyTorch with <code>nn.Embedding</code>) maps this index into a fixed-dimension embedding vector. In principle, all discrete features can be mapped into the embedding space using this approach.</p>
<figure class="figure">
<img alt="1_Dk1X5rmLomXqqTPeuHgBpw" src="/blog/md/multi-modal-transformer.assets/1_Dk1X5rmLomXqqTPeuHgBpw.png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Visualization of tokenizer and index-fetching embedding layer. <a href="https://medium.com/@hunter-j-phillips/the-embedding-layer-27d9c980d124" class="link" target="_blank">Source</a></figcaption>
</figure>
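<p>As a minimal illustration of the tokenizer-plus-embedding pipeline described above (a sketch only; the vocabulary size, model dimension, and token indices below are arbitrary assumptions rather than any particular tokenizer's output):</p>
<pre><code class="language-python">import torch
import torch.nn as nn

vocab_size, d_model = 32000, 512          # assumed values for illustration
embedding = nn.Embedding(vocab_size, d_model)

# A "tokenized" sentence: each integer is a token index produced by some tokenizer.
token_ids = torch.tensor([[101, 2054, 2003, 1037, 19081, 102]])

token_embeddings = embedding(token_ids)   # shape (1, 6, 512), ready for a Transformer
</code></pre>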
<h2>Vector Quantization</h2>
<p>For continuous features, one intuitive approach is to first tokenize them into discrete tokens, thereby unifying the embedding process across both discrete and continuous features. <strong>Vector quantization</strong>, introduced in VQ-VAE, is one of the most common methods for this purpose.</p>
<blockquote>
<p>Van Den Oord, Aaron, and Oriol Vinyals. "Neural discrete representation learning." NeurIPS, 2017.</p>
</blockquote>
<p>Vector quantization maintains a "codebook" <script type="math/tex">\boldsymbol C \in \mathbb R^{n\times d}</script>, which functions similarly to the index-fetching embedding layer, where <script type="math/tex">n</script> is the total number of unique tokens, and <script type="math/tex">d</script> is the embedding size. A given continuous vector <script type="math/tex">\boldsymbol{z}\in\mathbb R^{d}</script> is quantized into a discrete value <script type="math/tex">i\in\{0,1,\dots,n-1\}</script> by finding the closest row vector in <script type="math/tex">\boldsymbol C</script> to <script type="math/tex">\boldsymbol{z}</script>, and that row vector <script type="math/tex">\boldsymbol C_i</script> is fetched as the embedding for <script type="math/tex">\boldsymbol{z}</script>. Formally:
<script type="math/tex; mode=display">
i = \arg\min_j \|\boldsymbol z - \boldsymbol C_j\|_2
</script>
<img alt="Screen_Shot_2020-06-28_at_4.26.40_PM" src="/blog/md/multi-modal-transformer.assets/Screen_Shot_2020-06-28_at_4.26.40_PM.png" /></p>
<h2>Lookup-Free Quantization</h2>
<p>A significant limitation of vector quantization is that it requires calculating distances between the given continuous vectors and the entire codebook, which becomes computationally expensive for large-scale codebooks. This creates tension with the need for expanded codebooks to represent complex modalities such as images and videos. Research has shown that simply increasing the number of unique tokens doesn't always improve codebook performance.</p>
<blockquote>
<p>“A simple trick for training a larger codebook involves decreasing the code embedding dimension when increasing the vocabulary size.” Source: <em>Yu, Lijun, Jose Lezama, et al. “Language Model Beats Diffusion - Tokenizer Is Key to Visual Generation,” ICLR, 2024.</em></p>
</blockquote>
<p>Building on this insight, <strong>Lookup-Free Quantization</strong> (LFQ) eliminates the embedding dimension of codebooks (essentially reducing the embedding dimension to 0) and directly calculates the discrete index <script type="math/tex">i</script> by individually quantizing each dimension of <script type="math/tex">\boldsymbol z</script> into a binary digit. The index <script type="math/tex">i</script> can then be computed by converting the binary representation to decimal. Formally:
<script type="math/tex; mode=display">
i=\sum_{j=1}^{d} 2^{(j-1)}\cdot \mathbb{1}(z_j > 0)
</script>
</p>
<blockquote>
<p>For example, given a continuous vector <script type="math/tex">\boldsymbol z=\langle -0.52, 1.50, 0.53, -1.32\rangle</script>, we first quantize each dimension into <script type="math/tex">\langle 0, 1, 1, 0\rangle</script>, based on the sign of each dimension. The token index of <script type="math/tex">\boldsymbol z</script> is simply the decimal equivalent of the binary 0110, which is 6.</p>
</blockquote>
<p>However, this approach introduces another challenge: we still need an index-fetching embedding layer to map these token indices into embedding vectors for the Transformer. This, combined with the typically large number of unique tokens when using LFQ—a 32-dimensional <script type="math/tex">\boldsymbol z</script> will result in <script type="math/tex">2^{32}=4,294,967,296</script> unique tokens—creates significant efficiency problems. One solution is to factorize the token space. Effectively, this means splitting the binary digits into multiple parts, embedding each part separately, and concatenating the resulting embedding vectors. For example, with a 32-dimensional <script type="math/tex">\boldsymbol z</script>, if we quantize and embed its first and last 16 dimensions separately, we “only” need to handle <script type="math/tex">2^{16}*2= 131,072</script> unique tokens.</p>
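<p>The following sketch illustrates lookup-free quantization together with the factorization trick just described. The latent dimension, group size, and embedding width are arbitrary assumptions:</p>
<pre><code class="language-python">import torch
import torch.nn as nn

d, group = 32, 16      # latent dimension and factorization group size (assumed)
# One embedding table per group of binary digits: 2^16 entries each instead of 2^32.
tables = nn.ModuleList([nn.Embedding(2 ** group, 128) for _ in range(d // group)])

def lfq_embed(z):
    """Quantize each dimension of z (B, d) to a binary digit, then embed groups of digits."""
    bits = torch.gt(z, 0).long()              # indicator of z_j being positive, shape (B, d)
    weights = 2 ** torch.arange(group)        # binary-to-decimal weights
    parts = []
    for g, table in enumerate(tables):
        chunk = bits[:, g * group:(g + 1) * group]   # one group of binary digits
        index = (chunk * weights).sum(dim=1)         # decimal token index of the group
        parts.append(table(index))
    return torch.cat(parts, dim=1)                   # concatenated group embeddings

embeddings = lfq_embed(torch.randn(4, d))            # shape (4, 256)
</code></pre>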
<p>Note that this section doesn't extensively explain how to map raw continuous features into the vector <script type="math/tex">\boldsymbol{z}</script>, as these techniques are relatively straightforward and depend on the specific feature type—for example, fully-connected layers for numerical features, or CNN/GNN with feature flattening for structured data.</p>
<h2>Quantization over Linear Projection</h2>
<p>You might be asking—why can't we simply use linear projections to map the raw continuous features into the embedding space? What are the benefits of quantizing continuous features into discrete tokens?</p>
<p>Although Transformers are regarded as universal sequential models, they were originally designed for discrete tokens in <em>Vaswani et al., "Attention Is All You Need"</em>. Empirically, they tend to perform better on discrete tokens than on raw continuous features. This is supported by research showing that quantizing continuous features improves Transformer performance, as well as work demonstrating Transformers' subpar performance when applied directly to continuous features.</p>
<blockquote>
<p>Mao, Chengzhi, Lu Jiang, Mostafa Dehghani, Carl Vondrick, Rahul Sukthankar, and Irfan Essa. “Discrete Representations Strengthen Vision Transformer Robustness,” ICLR, 2022.</p>
<p>Ilbert, Romain, Ambroise Odonnat, et al. “SAMformer: Unlocking the Potential of Transformers in Time Series Forecasting with Sharpness-Aware Minimization and Channel-Wise Attention,” ICML, 2024.</p>
</blockquote>
<p>On the other hand, unifying different modalities into tokens is especially beneficial in the context of Transformer-based "foundation models," since it preserves the auto-regressive next-token prediction architecture of LLMs. Combined with special tokens such as "start of sentence" and "end of sentence," the Transformer model can flexibly generate content of mixed modalities and varying length.</p>
<blockquote>
<p>For example, by quantizing videos into discrete tokens and combining the token space of videos and language, one can create a unified Transformer model that generates both videos and language in one sequence. The start and end points of video and language sub-sequences are fully determined by the model, based on the specific input prompt. This structure would be difficult to replicate if we used tokenization for language but linear projection for videos.</p>
</blockquote>
<h1>Transformer Backbone</h1>
<p>After different modalities are mapped into the same embedding space, they can be arranged into a sequence of embedding vectors and input into a Transformer backbone. We don't discuss the variations of Transformer structure and improvement techniques here, as they are numerous, and ultimately function similarly as sequential models.</p>
<blockquote>
<p>Lan et al., “ALBERT”; Ye et al., “Differential Transformer”; Kitaev, Kaiser, and Levskaya, “Reformer”; Su et al., “RoFormer”; Dai et al., “Transformer-XL.”</p>
</blockquote>
<p>As we know, the "full" Transformer structure proposed in <em>Vaswani et al., "Attention Is All You Need"</em> includes an encoder and a decoder. They perform self-attention within their respective input sequences, and the decoder additionally performs cross-attention between its input sequence and the memory sequence derived from the encoder's output. Some early language models use encoder-only structure (like <em>Devlin et al., "BERT"</em>) focused on outputting embedding vectors or encoder-decoder structure (like <em>Chung et al., "Scaling Instruction-Finetuned Language Models"</em>) for generating natural language output. Most modern large language models and foundation models use decoder-only structure (like <em>Brown et al., "Language Models Are Few-Shot Learners"</em>), focusing on auto-regressive generation of language output.</p>
<p>The encoder-only structure theoretically excels at representation learning, and its produced embedding vectors can be applied to various downstream tasks. Recent developments have gradually moved towards decoder-only structure, centered around the idea of building models that are capable of directly generating the required final output of every downstream task.</p>
<blockquote>
<p>For example, to perform sentiment analysis, BERT will compute an embedding vector for the query sentence, and the embedding vector can be used in a dedicated classifier to predict the sentiment label. GPT, on the other hand, can directly answer the question "what is the sentiment associated with the query sentence?" Comparatively, GPT is more versatile in most cases and can easily perform zero-shot prediction.</p>
</blockquote>
<p>Nevertheless, representation learning is still a relevant topic. The general understanding is that decoder-only structure cannot perform conventional representation learning, for example mapping a sentence into a fixed-dimension embedding vector. Yet, there are a few works in the latest ICLR that shed light on the utilization of LLMs as representation learning or embedding models:</p>
<blockquote>
<p>Gao, Leo, Tom Dupre la Tour, Henk Tillman, Gabriel Goh, Rajan Troll, Alec Radford, Ilya Sutskever, Jan Leike, and Jeffrey Wu. “Scaling and Evaluating Sparse Autoencoders,” 2024. <a href="https://openreview.net/forum?id=tcsZt9ZNKD" class="link" target="_blank">Link</a></p>
<p>Li, Ziyue, and Tianyi Zhou. “Your Mixture-of-Experts LLM Is Secretly an Embedding Model for Free,” 2024. <a href="https://openreview.net/forum?id=eFGQ97z5Cd" class="link" target="_blank">Link</a></p>
<p>Zhang, Jie, Dongrui Liu, Chen Qian, Linfeng Zhang, Yong Liu, Yu Qiao, and Jing Shao. “REEF: Representation Encoding Fingerprints for Large Language Models,” 2024. <a href="https://openreview.net/forum?id=SnDmPkOJ0T" class="link" target="_blank">Link</a></p>
</blockquote>
<h1>Output Layer</h1>
<p>For language generation, Transformers typically use classifier output layers, mapping the latent vector of each item in the output sequence back to tokens. As we've established in the "modality embedding" section, the optimal method to embed continuous features is to quantize them into discrete tokens. Correspondingly, an intuitive method to output continuous features is to map these discrete tokens back to the continuous feature space, essentially reversing the vector quantization process.</p>
<h2>Reverse Vector Quantization</h2>
<p>One approach to reverse vector quantization is readily available in VQ-VAE, since it is an auto-encoder. Given a token <script type="math/tex">i</script>, we can look up its embedding in the codebook as <script type="math/tex">\boldsymbol C_i</script>, then apply a decoder network to map <script type="math/tex">\boldsymbol C_i</script> back to the continuous feature vector <script type="math/tex">\boldsymbol z</script>. The decoder network can be pre-trained in the VQ-VAE framework—pre-train the VQ-VAE tokenizer, encoder, and decoder using auto-encoding loss functions, or end-to-end trained along with the whole Transformer. In the NLP and CV communities, the pre-training approach is more popular, since there are many large-scale pre-trained auto-encoders available.</p>
<figure class="figure">
<img alt="image (4)" src="/blog/md/multi-modal-transformer.assets/image (4).png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">The encoder-decoder structure of MAGVIT (<em>Yu et al., “MAGVIT”</em>), a visual VQ-VAE model. A 3D-VQ encoder quantizes a video into discrete tokens, and a 3D-VQ decoder maps them back to the pixel space.</figcaption>
</figure>
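<p>A rough sketch of this detokenization path, where both the codebook and the decoder network are placeholders rather than the actual MAGVIT components:</p>
<pre><code class="language-python">import torch
import torch.nn as nn

n, d = 1024, 64
codebook = torch.randn(n, d)     # assumed to come from a pre-trained VQ-VAE

# Placeholder decoder; a real image or video decoder would be convolutional.
decoder = nn.Sequential(nn.Linear(d, 256), nn.ReLU(), nn.Linear(256, 3 * 16 * 16))

def detokenize(token_ids):
    """Map predicted token indices back to continuous features (here, 16x16 RGB patches)."""
    embeddings = codebook[token_ids]   # look up C_i for each predicted token
    patches = decoder(embeddings)      # decode embeddings back to pixel space
    return patches.view(token_ids.shape[0], 3, 16, 16)

patches = detokenize(torch.randint(0, n, (8,)))
</code></pre>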
<h2>Efficiency Enhancement</h2>
<p>For continuous feature generation, unlike language generation where the output tokens are the final output, we are essentially representing the final output with a token space of limited size. Thus, for complicated continuous features such as images and videos, improving generation quality requires expanding the token space or using more tokens per image or video frame, which can result in efficiency challenges.</p>
<p>There are several workarounds to improve the efficiency of multi-modal outputs. One approach is to generate low-resolution outputs first, then use a separate super-resolution module to improve the quality of the output. This approach is explored in <em>Kondratyuk et al., "VideoPoet"</em> and <em>Tian et al., "Visual Autoregressive Modeling"</em>. Interestingly, the overall idea is very similar to NVIDIA's DLSS, where the graphics card renders a low-resolution frame (e.g., 1080p) using the conventional rasterization pipeline, then a super-resolution model upscales the frame (e.g., to 4K) using the graphics card's tensor hardware, improving the game's overall frame rate.</p>
<p>Another workaround follows the idea of compression. Take video generation as an example. The model generates full features for key frames, and lightweight features for motion vectors that describe subtle differences from those key frames. This is essentially how inter-frame compressed video codecs work, taking advantage of temporal redundancy between neighboring frames.</p>
<figure class="figure">
<img alt="image (5)" src="/blog/md/multi-modal-transformer.assets/image (5).png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Keys frames and motion vectors used in <em>Jin et al., “Video-LaVIT.”</em></figcaption>
</figure>
<h1>Fuse with Diffusion Models</h1>
<p>Despite continuous efforts to enable representation and generation of images and videos with a language model structure (auto-regressive), current research indicates that diffusion models (more broadly speaking, score-matching generative models) outperform language models on continuous feature generation. Score-matching generative models have their own separate and substantial community, with strong theoretical foundations and numerous variations emerging each year, such as stochastic differential equations, Bayesian flow, and rectified flow. In short, score-matching generative models are clearly here to stay alongside language models.</p>
<p>An intriguing question arises: why not integrate the structures of language models and diffusion models into one Transformer to reach the best of both worlds? <em>Zhou et al. in "Transfusion"</em> explored this idea. The approach is straightforward: build a Transformer that can handle both language and image inputs and outputs. The language component functions as a language model, while the image component serves as a denoiser network for diffusion models. The model is trained by combining the language modeling loss and DDPM loss, enabling it to function either as a language model or a text-to-image denoiser.</p>
<figure class="figure">
<img alt="image (6)" src="/blog/md/multi-modal-transformer.assets/image (6).png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">A Transformer capable of function as a language model and a diffusion denoiser at the same time. Source: <em>Zhou, Chunting, Lili Yu, et al. “Transfusion: Predict the Next Token and Diffuse Images with One Multi-Modal Model,” ICLR, 2025.</em></figcaption>
</figure>
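<p>Conceptually, training combines the two objectives over the corresponding parts of the mixed sequence. The sketch below only conveys this structure; the function names and the loss weight are assumptions, not Transfusion's actual implementation:</p>
<pre><code class="language-python">import torch.nn.functional as F

def combined_loss(text_logits, text_targets, predicted_noise, true_noise, lambda_diff=1.0):
    """Language-modeling loss on text positions plus DDPM-style loss on image positions."""
    lm_loss = F.cross_entropy(text_logits, text_targets)       # next-token prediction
    diffusion_loss = F.mse_loss(predicted_noise, true_noise)   # noise regression
    return lm_loss + lambda_diff * diffusion_loss
</code></pre>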
<h1>Conclusion</h1>
<p>In conclusion, the evolution of Transformers into versatile foundation models capable of handling multiple modalities and functionalities represents a significant advancement in AI research. By enabling a single architecture to process diverse data types through techniques like vector quantization and lookup-free quantization, researchers have created models that can seamlessly integrate language, images, and other modalities within the same embedding space.</p>
<p>In our research domain, we encounter even more diverse and domain-specific multi-modal data, such as traffic flows, trajectories, and real-world agent interactions. A unified Transformer for such data presents a promising solution for creating "foundation models" that generalize across diverse tasks and scenarios. However, domain-specific challenges, including data encoding and decoding, computational efficiency, and scalability, must be addressed to realize this potential.</p></p>
</article>
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">Copyright © 2025. Designed and implemented by Yan Lin.</p>
</main>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
</body>
</html>
<script>
document.addEventListener('DOMContentLoaded', function() {
document.querySelectorAll('img').forEach(function(img) {
img.classList.add('figure-img', 'rounded');
});
});
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
var backToTopButton = document.getElementById('back-to-top');
if (window.scrollY > 100) {
backToTopButton.style.display = 'block';
} else {
backToTopButton.style.display = 'none';
}
});
// Scroll to top when the button is clicked
document.getElementById('back-to-top').addEventListener('click', function(e) {
e.preventDefault();
window.scrollTo({
top: 0,
behavior: 'smooth'
});
window.location.href = '#';
return false;
});
</script>


@@ -1,241 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Yan Lin's Blog - One Step Diffusion Models</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<script>
MathJax = {
tex: {
inlineMath: [['$', '$'], ['\\(', '\\)']],
displayMath: [['$$', '$$'], ['\\[', '\\]']]
},
options: {
skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code'],
processHtmlClass: 'arithmatex'
}
};
window.addEventListener('load', function() {
document.querySelectorAll('script[type^="math/tex"]').forEach(function(script) {
const isDisplay = script.type.includes('mode=display');
const math = script.textContent;
const span = document.createElement('span');
span.className = isDisplay ? 'mathjax-block' : 'mathjax-inline';
span.innerHTML = isDisplay ? `\\[${math}\\]` : `\\(${math}\\)`;
script.parentNode.replaceChild(span, script);
});
if (typeof MathJax !== 'undefined' && MathJax.typesetPromise) {
MathJax.typesetPromise();
}
});
</script>
<style>
a {
font-family: 'Lato', sans-serif;
}
img, .figure {
max-width: min(100%, 800px);
height: auto;
display: block;
margin-left: auto;
margin-right: auto;
}
.blog-title {
font-size: calc(1.35rem + 0.9vw);
font-weight: bold;
}
h1 {
font-size: calc(1.35rem + 0.6vw);
margin-top: 2rem;
}
h2 {
font-size: calc(1.1rem + 0.4vw);
margin-top: 1.5rem;
}
h3 {
font-size: calc(0.95rem + 0.1vw);
font-weight: bold;
margin-top: 1rem;
}
</style>
</head>
<body>
<div class="container">
<header class="border-bottom lh-1 py-3 border-secondary">
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
<a class="link-secondary header-icon px-2 h4" href="/"><i class="bi bi-house-fill"></i></a>
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">Yan Lin's Blog</div>
</div>
<div class="col-2 text-end">
<a class="link-secondary header-icon px-2 h4" href="/blog"><i class="bi bi-list-task"></i></a>
</div>
</div>
</header>
</div>
<main class="container">
<article class="section col-xl-10 col-xxl-9 mx-auto">
<p class="blog-title">One Step Diffusion Models</p>
<p><p>Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.</p>
<hr />
<h1>Background</h1>
<p>Diffusion models (DMs), or more broadly speaking, score-matching generative models, have become the de facto framework for building deep generative models. They demonstrate exceptional generation performance, especially on continuous modalities including images, videos, audio, and spatiotemporal data.</p>
<p>Most diffusion models work by coupling a forward diffusion process and a reverse denoising diffusion process. The forward diffusion process gradually adds noise to the ground truth clean data <script type="math/tex">X_0</script>, until noisy data <script type="math/tex">X_T</script> that follows a relatively simple distribution is reached. The reverse denoising diffusion process starts from the noisy data <script type="math/tex">X_T</script>, and removes the noise component step-by-step until clean generated data <script type="math/tex">X_0</script> is reached. The reverse process is inherently sequential: each step depends on the result of the previous one, so it cannot be parallelized within a single generation, which is inefficient when the number of steps is large.</p>
<figure class="figure">
<img alt="image-20250503125941212" src="/blog/md/one-step-diffusion-models.assets/image-20250503125941212.png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">The two processes in a typical diffusion model. <em>Source: Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”</em></figcaption>
</figure>
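<p>To make the two processes concrete, here is a heavily simplified DDPM-style sketch. The noise schedule is an arbitrary assumption and the denoiser is a placeholder; the point is that the reverse loop is strictly sequential:</p>
<pre><code class="language-python">import torch

T = 1000
betas = torch.linspace(1e-4, 0.02, T)            # assumed linear noise schedule
alphas_bar = torch.cumprod(1.0 - betas, dim=0)

def forward_diffuse(x0, t):
    """Sample the noisy X_t from clean X_0 in closed form."""
    noise = torch.randn_like(x0)
    return alphas_bar[t].sqrt() * x0 + (1.0 - alphas_bar[t]).sqrt() * noise, noise

def reverse_denoise(denoiser, shape):
    """Start from pure noise X_T and remove noise step by step; steps cannot run in parallel."""
    x = torch.randn(shape)
    for t in reversed(range(T)):
        predicted_noise = denoiser(x, t)
        coef = betas[t] / (1.0 - alphas_bar[t]).sqrt()
        x = (x - coef * predicted_noise) / (1.0 - betas[t]).sqrt()
        if t != 0:                                # no noise is added at the final step
            x = x + betas[t].sqrt() * torch.randn(shape)
    return x
</code></pre>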
<h2>Understanding DMs</h2>
<p>There are many ways to understand how Diffusion Models (DMs) work. One of the most common and intuitive approaches is that a DM learns an ordinary differential equation (ODE) that transforms noise into data. Imagine an ODE vector field between the noise <script type="math/tex">X_T</script> and clean data <script type="math/tex">X_0</script>. By training on sufficiently large numbers of timesteps <script type="math/tex">t\in [0,T]</script>, a DM is able to learn the vector (tangent) towards the cleaner data <script type="math/tex">X_{t-\Delta t}</script>, given any specific timestep <script type="math/tex">t</script> and the corresponding noisy data <script type="math/tex">X_t</script>. This idea is easy to illustrate in a simplified 1-dimensional data scenario.</p>
<figure class="figure">
<img alt="image-20250503132738122" src="/blog/md/one-step-diffusion-models.assets/image-20250503132738122.png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Illustrated ODE flow of a diffusion model on 1-dimensional data. <em>Source: Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”</em> It should be noted that as the figure suggests, there are differences between ODEs and DMs in a narrow sense. Flow matching models, a variant of DMs, more closely resemble ODEs.</figcaption>
</figure>
<h2>DMs Scale Poorly with Few Steps</h2>
<p>Vanilla DDPM, which is essentially a discrete-timestep DM, can only perform the reverse process using the same number of steps it is trained on, typically thousands. DDIM introduces a reparameterization scheme that enables skipping steps during the reverse process of DDPM. Continuous-timestep DMs, such as those formulated with stochastic differential equations (SDEs), naturally support using fewer steps in the reverse process than in the forward process during training.</p>
<blockquote>
<p>Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”
Song, Meng, and Ermon, “Denoising Diffusion Implicit Models.”
Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”</p>
</blockquote>
<p>Nevertheless, their performance typically degrades catastrophically when the number of reverse process steps is reduced to single digits.</p>
<figure class="figure">
<img alt="image-20250503135351246" src="/blog/md/one-step-diffusion-models.assets/image-20250503135351246.png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Images generated by conventional DMs with only a few steps of reverse process. <em>Source: Frans et al., “One Step Diffusion via Shortcut Models.”</em></figcaption>
</figure>
<p>To understand why DMs scale poorly with few reverse process steps, we can return to the ODE vector field perspective of DMs. When the target data distribution is complex, the vector field typically contains numerous intersections. When a given <script type="math/tex">X_t</script> and <script type="math/tex">t</script> are at one of these intersections, the vector points to the averaged direction of all candidates. This causes the generated data to approach the mean of the training data when only a few reverse process steps are used. Another explanation is that the learned vector field is highly curved. Using only a few reverse process steps means attempting to approximate these curves with polylines, which is inherently difficult.</p>
<figure class="figure">
<img alt="image-20250503141422791" src="/blog/md/one-step-diffusion-models.assets/image-20250503141422791.png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Illustration of the why DMs scale poorly with few reverse process steps. <em>Source: Frans et al., “One Step Diffusion via Shortcut Models.”</em></figcaption>
</figure>
<p>We will introduce two branches of methods that aim to scale DMs down to a few or even a single reverse process step: <strong>distillation-based</strong> methods, which distill a pre-trained DM into a one-step model; and <strong>end-to-end-based</strong> methods, which train a one-step DM from scratch.</p>
<h1>Distillation</h1>
<p>Distillation-based methods are also called <strong>rectified flow</strong> methods. Their idea follows the above insight of "curved ODE vector field": if the curved vectors (flows) are hindering the scaling of reverse process steps, can we try to straighten these vectors so that they are easy to approximate with polylines or even straight lines?</p>
<p><em>Liu, Gong, and Liu, "Flow Straight and Fast"</em> implements this idea, focusing on learning an ODE that follows straight vectors as much as possible. In the context of continuous-time DMs where <script type="math/tex">T=1</script> and <script type="math/tex">t\in[0,1]</script>, suppose the clean data <script type="math/tex">X_0</script> and noise <script type="math/tex">X_1</script> each follow a distribution, <script type="math/tex">X_0\sim \pi_0</script> and <script type="math/tex">X_1\sim \pi_1</script>. The "straight vectors" can be achieved by solving a nonlinear least squares optimization problem:
<script type="math/tex; mode=display">
\min_{v} \int_{0}^{1} \mathbb{E}\left[\left\|\left(X_{1}-X_{0}\right)-v\left(X_{t}, t\right)\right\|^{2}\right] \mathrm{d} t,
\qquad X_{t}=t X_{1}+(1-t) X_{0},
</script>
</p>
<p>Where <script type="math/tex">v</script> is the vector field of the ODE <script type="math/tex">dZ_t = v(Z_t,t)dt</script>.</p>
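<p>A minimal sketch of this objective in PyTorch, where the velocity network <code>v_net</code> is a placeholder; it simply regresses the straight-line direction from clean data to noise at a random interpolation point:</p>
<pre><code class="language-python">import torch

def rectified_flow_loss(v_net, x0, x1):
    """One training step of the straight-flow objective: regress (X_1 - X_0) at X_t."""
    t = torch.rand(x0.shape[0], 1)           # uniform t in [0, 1]
    xt = t * x1 + (1.0 - t) * x0             # linear interpolation between data and noise
    target = x1 - x0                         # the "straight vector" to learn
    return ((v_net(xt, t) - target) ** 2).mean()
</code></pre>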
<p>Though the idea is straightforward, when the clean data distribution <script type="math/tex">\pi_0</script> is very complicated, the ideal result of completely straight vectors can be hard to achieve. To address this, a "reflow" procedure is introduced. This procedure iteratively trains new rectified flows using data generated by previously obtained flows:
<script type="math/tex; mode=display">
Z^{(k+1)} = \mathrm{RectFlow}\big((Z_0^{(k)}, Z_1^{(k)})\big)
</script>
This procedure produces increasingly straight flows that can be simulated with very few steps, ideally one step after several iterations.</p>
<figure class="figure">
<img alt="image-20250504142749208" src="/blog/md/one-step-diffusion-models.assets/image-20250504142749208.png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Illustrations of vector fields after different times of reflow processes. <em>Source: Liu, Gong, and Liu, “Flow Straight and Fast.”</em></figcaption>
</figure>
<p>In practice, distillation-based methods are usually trained in two stages: first train a normal DM, and later distill one-step capabilities into it. This introduces additional computational overhead and complexity.</p>
<h1>End-to-end</h1>
<p>Compared to distillation-based methods, end-to-end-based methods train a one-step-capable diffusion model (DM) within a single training run. Various techniques are used to implement such methods. We will focus on two of them: <strong>consistency models</strong> and <strong>shortcut models</strong>.</p>
<h2>Consistency Models</h2>
<p>In discrete-timestep diffusion models (DMs), three components in the reverse denoising diffusion process are interchangeable through reparameterization: the noise component <script type="math/tex">\epsilon_t</script> to remove, the less noisy previous step <script type="math/tex">x_{t-1}</script>, and the predicted clean sample <script type="math/tex">x_0</script>. This interchangeability is enabled by the following equation:
<script type="math/tex; mode=display">
x_t = \sqrt{\bar{\alpha}_t} \, x_0 + \sqrt{1 - \bar{\alpha}_t} \, \epsilon_t
</script>
In theory, without altering the fundamental formulation of DMs, the learnable denoiser network can be designed to predict any of these three components. Consistency models (CMs) follow this principle by training the denoiser to specifically predict the clean sample <script type="math/tex">x_0</script>. The benefit of this approach is that CMs can naturally scale to perform the reverse process with few steps or even a single step.</p>
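<p>As an aside, the equation above makes these parameterizations interchangeable with simple algebra. A sketch of the conversion, where <code>alpha_bar_t</code> denotes the cumulative product of the per-step signal-retention coefficients:</p>
<pre><code class="language-python">def eps_to_x0(x_t, predicted_eps, alpha_bar_t):
    """Recover the predicted clean sample from a predicted noise component."""
    return (x_t - (1.0 - alpha_bar_t) ** 0.5 * predicted_eps) / alpha_bar_t ** 0.5

def x0_to_eps(x_t, predicted_x0, alpha_bar_t):
    """Recover the implied noise component from a predicted clean sample."""
    return (x_t - alpha_bar_t ** 0.5 * predicted_x0) / (1.0 - alpha_bar_t) ** 0.5
</code></pre>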
<figure class="figure">
<img alt="image-20250504161430743" src="/blog/md/one-step-diffusion-models.assets/image-20250504161430743.png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">A consistency model that learns to map any point on the ODE trajectory to the clean sample. <em>Source: Song et al., “Consistency Models.”</em></figcaption>
</figure>
<p>Formally, CMs learn a function <script type="math/tex">f_\theta(x_t,t)</script> that maps noisy data <script type="math/tex">x_t</script> at time <script type="math/tex">t</script> directly to the clean data <script type="math/tex">x_0</script>, satisfying:
<script type="math/tex; mode=display">
f_\theta(x_t, t) = f_\theta(x_{t'}, t') \quad \forall\, t, t' \text{ on the same ODE trajectory}
</script>
The model must also obey the differential consistency condition:
<script type="math/tex; mode=display">
\frac{d}{dt} f_\theta(x_t, t) = 0
</script>
CMs are trained by minimizing the discrepancy between outputs at adjacent times, with the loss function:
<script type="math/tex; mode=display">
\mathcal{L} = \mathbb{E} \left[ d\left(f_\theta(x_t, t), f_\theta(x_{t'}, t')\right) \right]
</script>
Similar to continuous-timestep DMs and discrete-timestep DMs, CMs also have continuous-time and discrete-time variants. Discrete-time CMs are easier to train, but are more sensitive to timestep scheduling and suffer from discretization errors. Continuous-time CMs, on the other hand, suffer from instability during training.</p>
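<p>A simplified sketch of one discrete-time consistency-training step. The shared-noise construction and the stop-gradient target are assumptions in the spirit of the original recipe, not an exact reproduction:</p>
<pre><code class="language-python">import torch
import torch.nn.functional as F

def consistency_training_step(f_theta, x0, t, t_prime, sigma):
    """Penalize disagreement between outputs at two adjacent noise levels of one trajectory."""
    noise = torch.randn_like(x0)
    x_t = x0 + sigma(t) * noise            # two points built from the same noise sample,
    x_tp = x0 + sigma(t_prime) * noise     # approximating the same ODE trajectory
    pred = f_theta(x_t, t)
    with torch.no_grad():                  # the target branch is not back-propagated through
        target = f_theta(x_tp, t_prime)
    return F.mse_loss(pred, target)
</code></pre>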
<p>For a deeper discussion of the differences between the two variants of CMs, and how to stabilize continuous-time CMs, please refer to <em>Lu and Song, "Simplifying, Stabilizing and Scaling Continuous-Time Consistency Models."</em></p>
<h2>Shortcut Models</h2>
<p>Similar to distillation-based methods, the core idea of shortcut models is inspired by the "curved vector field" problem, but the shortcut models take a different approach to solve it.</p>
<p>Shortcut models are introduced in <em>Frans et al., "One Step Diffusion via Shortcut Models."</em> The paper presents the insight that conventional DMs perform badly when jumping with large step sizes because they are unaware of the step size they are asked to jump forward. Since they are only trained to comply with small step sizes, they only learn the tangents of the curved vector field, not the "correct direction" when a large step size is used.</p>
<p>Based on this insight, on top of <script type="math/tex">x_t</script> and <script type="math/tex">t</script>, shortcut models additionally include step size <script type="math/tex">d</script> as part of the condition for the denoiser network. At small step sizes (<script type="math/tex">d\rightarrow 0</script>), the model behaves like a standard flow-matching model, learning the expected tangent from noise to data. For larger step sizes, the model learns that one large step should equal two consecutive smaller steps (self-consistency), creating a binary recursive formulation. The model is trained by combining the standard flow matching loss when <script type="math/tex">d=0</script> and the self-consistency loss when <script type="math/tex">d>0</script>:
<script type="math/tex; mode=display">
\mathcal{L} = \mathbb{E} \Big[ \underbrace{\| s_\theta(x_t, t, 0) - (x_1 - x_0)\|^2}_{\text{Flow-Matching}} + \underbrace{\| s_\theta(x_t, t, 2d) - \mathbf{s}_{\text{target}}\|^2}_{\text{Self-Consistency}} \Big],
</script>
<script type="math/tex; mode=display">
\mathbf{s}_{\text{target}} = s_\theta(x_t, t, d)/2 + s_\theta(x'_{t+d}, t + d, d)/2,
\qquad x'_{t+d} = x_t + s_\theta(x_t, t, d)\, d
</script>
</p>
<figure class="figure">
<img alt="image-20250504180714955" src="/blog/md/one-step-diffusion-models.assets/image-20250504180714955.png" / class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Illustration of the training process of shortcut models. <em>Source: Frans et al., “One Step Diffusion via Shortcut Models.”</em></figcaption>
</figure>
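<p>A condensed sketch of how the self-consistency target above can be computed. Here <code>s_theta</code> is a placeholder denoiser that takes the step size as an extra input, and the sampling of <code>t</code> and <code>d</code> is omitted for brevity:</p>
<pre><code class="language-python">import torch

def shortcut_losses(s_theta, x0, x1, t, d):
    """Flow-matching loss at step size 0 plus self-consistency: one 2d step equals two d steps."""
    xt = t * x1 + (1.0 - t) * x0
    flow_matching = ((s_theta(xt, t, torch.zeros_like(d)) - (x1 - x0)) ** 2).mean()

    with torch.no_grad():                                # the target is built from two small steps
        first = s_theta(xt, t, d)
        x_next = xt + first * d                          # jump forward by d using the first step
        second = s_theta(x_next, t + d, d)
        target = (first + second) / 2.0
    self_consistency = ((s_theta(xt, t, 2.0 * d) - target) ** 2).mean()
    return flow_matching + self_consistency
</code></pre>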
<p>Both consistency models and shortcut models can be seamlessly scaled between one-step and multi-step generation to balance quality and efficiency.</p></p>
</article>
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">Copyright © 2025. Designed and implemented by Yan Lin.</p>
</main>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
</body>
</html>
<script>
document.addEventListener('DOMContentLoaded', function() {
document.querySelectorAll('img').forEach(function(img) {
img.classList.add('figure-img', 'rounded');
});
});
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
var backToTopButton = document.getElementById('back-to-top');
if (window.scrollY > 100) {
backToTopButton.style.display = 'block';
} else {
backToTopButton.style.display = 'none';
}
});
// Scroll to top when the button is clicked
document.getElementById('back-to-top').addEventListener('click', function(e) {
e.preventDefault();
window.scrollTo({
top: 0,
behavior: 'smooth'
});
window.location.href = '#';
return false;
});
</script>

dist/blog/index.html (vendored, 105 lines)

@@ -1,105 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Yan Lin's Blog</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
</head>
<body>
<main class="container">
<header class="border-bottom lh-1 py-3 border-secondary">
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
<a class="link-secondary header-icon px-2 h4" href="/"><i class="bi bi-house-fill"></i></a>
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">Yan Lin's Blog</div>
</div>
<div class="col-2 text-end">
</div>
</div>
</header>
<article class="section mt-4">
<div class="list-group list-group-flush">
<div class="list-group-item px-0">
<a class="mb-1 paper-title blog-link text-decoration-none" href="/blog/html/one-step-diffusion-models.html">
One Step Diffusion Models <i class="bi bi-arrow-right-circle"></i>
</a> <span class="paper-title text-muted ms-2">May 2025</span>
<p class="card-text mb-auto tldr">Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.</p>
</div>
<div class="list-group-item px-0">
<a class="mb-1 paper-title blog-link text-decoration-none" href="/blog/html/multi-modal-transformer.html">
Multi-modal and Multi-function Transformers <i class="bi bi-arrow-right-circle"></i>
</a> <span class="paper-title text-muted ms-2">April 2025</span>
<p class="card-text mb-auto tldr">Multi-modal and multi-function Transformers enables a single architecture to process diverse data types such as language, images, and videos simultaneously. These models employ techniques like vector quantization and lookup-free quantization to map different modalities into a unified embedding space, allowing the Transformer to handle them within the same sequence. Beyond processing multiple data types, these architectures can also combine different functionalities-such as auto-regressive language generation and diffusion-based image creation-within a single model.</p>
</div>
</div>
</article>
</main>
<footer>
<div class="container">
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">
<span class="dark-mode-text"><i class="bi bi-moon-fill"></i> ずっと真夜中でいいのに。</span>
<span class="light-mode-text"><i class="bi bi-sun-fill"></i> ずっと正午でいいのに。</span>
<span class="mx-1">|</span>
Designed and implemented by Yan Lin.
<span class="mx-1">|</span>
<a class="link link-secondary" target="_blank" href="https://git.yanlincs.com/yanlin/Homepage">Source Code</a>
</p>
</div>
</footer>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
<script>
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
var backToTopButton = document.getElementById('back-to-top');
if (window.scrollY > 100) {
backToTopButton.style.display = 'block';
} else {
backToTopButton.style.display = 'none';
}
});
// Scroll to top when the button is clicked
document.getElementById('back-to-top').addEventListener('click', function(e) {
e.preventDefault();
window.scrollTo({
top: 0,
behavior: 'smooth'
});
window.location.href = '#';
return false;
});
</script>
</body>
</html>

9 binary files not shown (deleted image assets; previous sizes: 134 KiB, 351 KiB, 375 KiB, 173 KiB, 136 KiB, 421 KiB, 730 KiB, 154 KiB, 304 KiB).


@@ -1,148 +0,0 @@
# Multi-modal and Multi-function Transformers
Transformers have gained immense popularity within deep learning and AI communities in recent years. Since their introduction in *Vaswani et al., "Attention Is All You Need"*, they have proven to be powerful sequential models across diverse domains, with thousands of variations and "improved versions." The rise of Large Language Models (LLMs), which largely use Transformers as their foundation, has led to another surge in research around this architecture. This trend has even led graph learning and Computer Vision (CV) communities to move beyond their established foundation models (i.e., GNNs and CNNs) and embrace Transformers. This explains the increasing prevalence of graph Transformers and image Transformers today.
> Han et al., “A Survey on Vision Transformer”; Khan et al., “Transformers in Vision”; Yun et al., “Graph Transformer Networks.”
Beyond "chasing the trend," using Transformer as a unified foundation model offers several advantages:
- Transformers excel at capturing long-term dependencies. Unlike GNNs and CNNs, which require deeper network structures to cover longer context, Transformers natively support global dependency modeling through their self-attention mechanism. They also avoid the over-smoothing and vanishing-gradient problems that hinder context-length scaling in other network architectures.
- Transformers process sequences in parallel rather than sequentially, enabling full utilization of GPU acceleration. This advantage can be further enhanced with techniques like those described in *Dao et al., "FlashAttention."*
- Transformers are flexible network structures. They do not inherently enforce sequentiality: without positional encoding, the input steps are treated as an unordered set, since self-attention is permutation-equivariant. Through strategic permutation and positional encoding, Transformers can adapt to a wide range of structured and unstructured data.
- The development of LLMs has made many open-weight Transformer models available with strong natural language understanding capabilities. These Transformers can be prompted and fine-tuned to model other modalities such as spatiotemporal data and images while retaining their language modeling abilities, creating opportunities for developing multi-modal foundation models.
- From a practical perspective, using Transformer as a foundation allows reuse of technical infrastructure and optimizations developed over years, including efficient architecture designs, training pipelines, and specialized hardware.
In this article, we will briefly explore techniques for unifying multiple modalities (e.g., natural language and images) and multiple functionalities (e.g., language models and diffusion denoisers) within a single Transformer. These techniques are largely sourced from recent oral papers presented at ICML, ICLR, and CVPR conferences. I assume readers have general knowledge of basic concepts in ML and neural networks, Transformers, LLMs, and diffusion models.
Since images and language modalities represent continuous and discrete data respectively, we will use them as examples throughout this article. Keep in mind that the techniques introduced can be readily extended to other modalities, including spatiotemporal data.
# General Goal
The goal of a multi-modal Transformer is to create a model that can accept multi-modal inputs and produce multi-modal outputs. For example, instead of using a CNN-based image encoder and a Transformer-based language encoder to map image and language modalities to the latent space separately, a multi-modal Transformer would be able to process the combination of image and language (sentence) as a single sequence.
![image](multi-modal-transformer.assets/image.png)
> An example of “conventional” multi-modal fusion. Each modality is processed by a separate model, and the results are fused at some point. Source: *Xiang, Hao, Runsheng Xu, and Jiaqi Ma. "HM-ViT: Hetero-modal vehicle-to-vehicle cooperative perception with vision transformer." CVPR, 2023.*
![image (1)](multi-modal-transformer.assets/image (1).png)
> An example of a Transformer that can handle multi-modal inputs and outputs. Different modalities are all projected into tokens and subsequently processed by a unified Transformer encoder. Source: *Kondratyuk, Dan, Lijun Yu, et al. “VideoPoet: A Large Language Model for Zero-Shot Video Generation,” ICML, 2024.*
Beyond multi-modal processing, a multi-function Transformer can, for example, function as both a language model (auto-regressive generation) and a diffusion denoiser (score-matching generation) simultaneously, supporting two of the most common generation schemes used today.
# Modality Embedding
A fundamental challenge in unifying multiple modalities within a single Transformer is how to represent different modalities in the same embedding space. For the "QKV" self-attention mechanism to work properly, each item in the input sequence must be represented by an embedding vector of the same dimension, matching the "model dimension" of the Transformer.
![image (2)](multi-modal-transformer.assets/image (2).png)
> Illustration of the QKV self-attention mechanism in Transformer. [Source](https://en.wikipedia.org/wiki/Attention_(machine_learning))
The most common method for mapping language into the embedding space is through tokenization and token embedding. A tokenizer maps a word or word fragment into a discrete token index, and an index-fetching embedding layer (implemented in frameworks like PyTorch with `nn.Embedding`) maps this index into a fixed-dimension embedding vector. In principle, all discrete features can be mapped into the embedding space using this approach.
![1_Dk1X5rmLomXqqTPeuHgBpw](multi-modal-transformer.assets/1_Dk1X5rmLomXqqTPeuHgBpw.png)
> Visualization of tokenizer and index-fetching embedding layer. [Source](https://medium.com/@hunter-j-phillips/the-embedding-layer-27d9c980d124)
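As a concrete (if simplified) sketch, the index-fetching step can be written in a few lines of PyTorch; the vocabulary size, model dimension, and toy token indices below are made-up values for illustration.

```python
import torch
import torch.nn as nn

vocab_size, d_model = 32000, 512                 # illustrative sizes, not from any specific model
token_embedding = nn.Embedding(vocab_size, d_model)

# Assume a tokenizer has already mapped a short sentence to these (hypothetical) indices.
token_ids = torch.tensor([[4812, 389, 12582]])   # shape: (batch, seq_len)
embeddings = token_embedding(token_ids)          # shape: (batch, seq_len, d_model)
```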
## Vector Quantization
For continuous features, one intuitive approach is to first tokenize them into discrete tokens, thereby unifying the embedding process across both discrete and continuous features. **Vector quantization**, introduced in VQ-VAE, is one of the most common methods for this purpose.
> Van Den Oord, Aaron, and Oriol Vinyals. "Neural discrete representation learning." NeurIPS, 2017.
Vector quantization maintains a "codebook" $\boldsymbol C \in \mathbb R^{n\times d}$, which functions similarly to the index-fetching embedding layer, where $n$ is the total number of unique tokens, and $d$ is the embedding size. A given continuous vector $\boldsymbol{z}\in\mathbb R^{d}$ is quantized into a discrete value $i\in\{0,1,\dots,n-1\}$ by finding the closest row vector in $\boldsymbol C$ to $\boldsymbol{z}$, and that row vector $\boldsymbol C_i$ is fetched as the embedding for $\boldsymbol{z}$. Formally:
$$
i = \arg\min_j \|\boldsymbol z - \boldsymbol C_j\|_2
$$
![Screen_Shot_2020-06-28_at_4.26.40_PM](multi-modal-transformer.assets/Screen_Shot_2020-06-28_at_4.26.40_PM.png)
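A minimal sketch of this lookup, assuming a codebook has already been learned; the VQ-VAE training details (commitment loss, straight-through gradients) are omitted.

```python
import torch

def vector_quantize(z, codebook):
    """Map continuous vectors z (B, d) to their nearest codebook rows and indices."""
    distances = torch.cdist(z, codebook)      # pairwise L2 distances, shape (B, n)
    indices = distances.argmin(dim=-1)        # i = argmin_j ||z - C_j||_2
    return codebook[indices], indices

codebook = torch.randn(1024, 64)              # n = 1024 tokens, d = 64 (illustrative)
z = torch.randn(8, 64)
quantized, token_ids = vector_quantize(z, codebook)
```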
## Lookup-Free Quantization
A significant limitation of vector quantization is that it requires calculating distances between the given continuous vectors and the entire codebook, which becomes computationally expensive for large-scale codebooks. This creates tension with the need for expanded codebooks to represent complex modalities such as images and videos. Research has shown that simply increasing the number of unique tokens doesn't always improve codebook performance.
> “A simple trick for training a larger codebook involves decreasing the code embedding dimension when increasing the vocabulary size.” Source: *Yu, Lijun, Jose Lezama, et al. “Language Model Beats Diffusion - Tokenizer Is Key to Visual Generation,” ICLR, 2024.*
Building on this insight, **Lookup-Free Quantization** (LFQ) eliminates the embedding dimension of codebooks (essentially reducing the embedding dimension to 0) and directly calculates the discrete index $i$ by individually quantizing each dimension of $\boldsymbol z$ into a binary digit. The index $i$ can then be computed by converting the binary representation to decimal. Formally:
$$
i=\sum_{j=1}^{d} 2^{\,j-1}\cdot \mathbb{1}(z_j > 0)
$$
> For example, given a continuous vector $\boldsymbol z=\langle -0.52, 1.50, 0.53, -1.32\rangle$, we first quantize each dimension into $\langle 0, 1, 1, 0\rangle$, based on the sign of each dimension. The token index of $\boldsymbol z$ is simply the decimal equivalent of the binary 0110, which is 6.
However, this approach introduces another challenge: we still need an index-fetching embedding layer to map these token indices into embedding vectors for the Transformer. Combined with the typically large number of unique tokens produced by LFQ (a 32-dimensional $\boldsymbol z$ results in $2^{32}=4{,}294{,}967{,}296$ unique tokens), this creates significant efficiency problems. One solution is to factorize the token space: split the binary digits into multiple parts, embed each part separately, and concatenate the resulting embedding vectors. For example, with a 32-dimensional $\boldsymbol z$, if we quantize and embed its first and last 16 dimensions separately, we “only” need embedding tables with $2^{16}\times 2 = 131{,}072$ entries in total.
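The sign-based quantization and the factorized embedding trick might look roughly like the following; the 32-dimensional latent, the two-way split, and the embedding sizes are illustrative assumptions rather than the exact LFQ implementation.

```python
import torch
import torch.nn as nn

def lfq_indices(z):
    """Lookup-free quantization: one binary digit per dimension, then binary-to-decimal."""
    bits = (z > 0).long()                                      # (B, d) of 0/1
    weights = 2 ** torch.arange(z.shape[-1], device=z.device)  # 2^(j-1) for each dimension
    return (bits * weights).sum(dim=-1)                        # token index i

d_latent, d_model = 32, 512
z = torch.randn(4, d_latent)

# Factorize the 32-bit token space into two 16-bit halves to keep the tables manageable.
embed_low = nn.Embedding(2 ** 16, d_model // 2)
embed_high = nn.Embedding(2 ** 16, d_model // 2)
token_embedding = torch.cat(
    [embed_low(lfq_indices(z[:, :16])), embed_high(lfq_indices(z[:, 16:]))], dim=-1
)                                                              # shape: (4, d_model)
```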
Note that this section doesn't extensively explain how to map raw continuous features into the vector $\boldsymbol{z}$, as these techniques are relatively straightforward and depend on the specific feature type—for example, fully-connected layers for numerical features, or CNN/GNN with feature flattening for structured data.
## Quantization over Linear Projection
You might be asking—why can't we simply use linear projections to map the raw continuous features into the embedding space? What are the benefits of quantizing continuous features into discrete tokens?
Although Transformers are regarded as universal sequential models, they were originally designed for discrete tokens in *Vaswani et al., "Attention Is All You Need"*. Empirically, they tend to perform better on discrete tokens than on raw continuous features. This is supported by research showing that quantizing continuous features improves the performance of Transformers, as well as by works demonstrating Transformers' subpar performance when applied directly to continuous features.
> Mao, Chengzhi, Lu Jiang, Mostafa Dehghani, Carl Vondrick, Rahul Sukthankar, and Irfan Essa. “Discrete Representations Strengthen Vision Transformer Robustness,” ICLR, 2022.
> Ilbert, Romain, Ambroise Odonnat, et al. “SAMformer: Unlocking the Potential of Transformers in Time Series Forecasting with Sharpness-Aware Minimization and Channel-Wise Attention,” ICML, 2024.
On the other hand, unifying different modalities into tokens is especially beneficial in the context of Transformer-based "foundation models," since it preserves the auto-regressive next-token prediction architecture of LLMs. Combined with special tokens such as "start of sentence" and "end of sentence," the Transformer model can flexibly generate content of mixed modalities and varying length.
> For example, by quantizing videos into discrete tokens and combining the token space of videos and language, one can create a unified Transformer model that generates both videos and language in one sequence. The start and end points of video and language sub-sequences are fully determined by the model, based on the specific input prompt. This structure would be difficult to replicate if we used tokenization for language but linear projection for videos.
# Transformer Backbone
After the different modalities are mapped into the same embedding space, they can be arranged into a sequence of embedding vectors and fed into a Transformer backbone. We don't discuss the many variations of the Transformer structure and related improvement techniques here; they are numerous, and ultimately they all function similarly as sequential models.
> Lan et al., “ALBERT”; Ye et al., “Differential Transformer”; Kitaev, Kaiser, and Levskaya, “Reformer”; Su et al., “RoFormer”; Dai et al., “Transformer-XL.”
As we know, the "full" Transformer structure proposed in *Vaswani et al., "Attention Is All You Need"* includes an encoder and a decoder. They perform self-attention within their respective input sequences, and the decoder additionally performs cross-attention between its input sequence and the memory sequence derived from the encoder's output. Some early language models use encoder-only structure (like *Devlin et al., "BERT"*) focused on outputting embedding vectors or encoder-decoder structure (like *Chung et al., "Scaling Instruction-Finetuned Language Models"*) for generating natural language output. Most modern large language models and foundation models use decoder-only structure (like *Brown et al., "Language Models Are Few-Shot Learners"*), focusing on auto-regressive generation of language output.
The encoder-only structure theoretically excels at representation learning, and its produced embedding vectors can be applied to various downstream tasks. Recent developments have gradually moved towards decoder-only structure, centered around the idea of building models that are capable of directly generating the required final output of every downstream task.
> For example, to perform sentiment analysis, BERT will compute an embedding vector for the query sentence, and the embedding vector can be used in a dedicated classifier to predict the sentiment label. GPT, on the other hand, can directly answer the question "what is the sentiment associated with the query sentence?" Comparatively, GPT is more versatile in most cases and can easily perform zero-shot prediction.
Nevertheless, representation learning is still a relevant topic. The general understanding is that decoder-only structure cannot perform conventional representation learning, for example mapping a sentence into a fixed-dimension embedding vector. Yet, there are a few works in the latest ICLR that shed light on the utilization of LLMs as representation learning or embedding models:
> Gao, Leo, Tom Dupre la Tour, Henk Tillman, Gabriel Goh, Rajan Troll, Alec Radford, Ilya Sutskever, Jan Leike, and Jeffrey Wu. “Scaling and Evaluating Sparse Autoencoders,” 2024. [Link](https://openreview.net/forum?id=tcsZt9ZNKD)
> Li, Ziyue, and Tianyi Zhou. “Your Mixture-of-Experts LLM Is Secretly an Embedding Model for Free,” 2024. [Link](https://openreview.net/forum?id=eFGQ97z5Cd)
> Zhang, Jie, Dongrui Liu, Chen Qian, Linfeng Zhang, Yong Liu, Yu Qiao, and Jing Shao. “REEF: Representation Encoding Fingerprints for Large Language Models,” 2024. [Link](https://openreview.net/forum?id=SnDmPkOJ0T)
# Output Layer
For language generation, Transformers typically use classifier output layers, mapping the latent vector of each item in the output sequence back to tokens. As we've established in the "modality embedding" section, the optimal method to embed continuous features is to quantize them into discrete tokens. Correspondingly, an intuitive method to output continuous features is to map these discrete tokens back to the continuous feature space, essentially reversing the vector quantization process.
## Reverse Vector Quantization
One approach to reversing vector quantization is readily available in VQ-VAE, since it is an auto-encoder. Given a token $i$, we can look up its embedding in the codebook as $\boldsymbol C_i$, then apply a decoder network to map $\boldsymbol C_i$ back to the continuous feature vector $\boldsymbol z$. The decoder network can either be pre-trained within the VQ-VAE framework (training the VQ-VAE tokenizer, encoder, and decoder with auto-encoding loss functions) or trained end-to-end along with the whole Transformer. In the NLP and CV communities, the pre-training approach is more popular, since many large-scale pre-trained auto-encoders are available.
![image (4)](multi-modal-transformer.assets/image (4).png)
> The encoder-decoder structure of MAGVIT (*Yu et al., “MAGVIT”*), a visual VQ-VAE model. A 3D-VQ encoder quantizes a video into discrete tokens, and a 3D-VQ decoder maps them back to the pixel space.
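Schematically, the reverse path is a codebook lookup followed by a decoder; the tiny MLP decoder and the patch shape below are placeholders, not the actual MAGVIT or VQ-VAE architecture.

```python
import torch
import torch.nn as nn

codebook = torch.randn(1024, 64)          # assumed pre-trained codebook
decoder = nn.Sequential(                  # stand-in for a pre-trained VQ-VAE decoder
    nn.Linear(64, 256), nn.ReLU(), nn.Linear(256, 3 * 16 * 16),
)

def detokenize(token_ids):
    """Map predicted token indices back to continuous features (here: 16x16 RGB patches)."""
    z = codebook[token_ids]               # look up C_i for each predicted token
    return decoder(z).view(-1, 3, 16, 16)

patches = detokenize(torch.tensor([5, 42, 873]))   # shape: (3, 3, 16, 16)
```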
## Efficiency Enhancement
For continuous feature generation, unlike language generation where the output tokens are the final output, we are essentially representing the final output with a limited-size token space. Thus, for complex continuous features like images and videos, we have to expand the token space or use more tokens to represent one image or one video frame to improve generation quality, which can create efficiency challenges.
There are several workarounds to improve the efficiency of multi-modal outputs. One approach is to generate low-resolution outputs first, then use a separate super-resolution module to improve the quality of the output. This approach is explored in *Kondratyuk et al., "VideoPoet"* and *Tian et al., "Visual Autoregressive Modeling"*. Interestingly, the overall idea is very similar to NVIDIA's DLSS, where the graphics card renders a low-resolution frame (e.g., 1080p) using the conventional rasterization pipeline, then a super-resolution model upscales the frame (e.g., to 4K) using the graphics card's tensor hardware, improving the game's overall frame rate.
Another workaround follows the idea of compression. Take video generation as an example: the model generates full features for key frames, and lightweight features for motion vectors that describe subtle differences from those key frames. This is essentially how inter-frame compressed video codecs work, taking advantage of temporal redundancy between neighboring frames.
![image (5)](multi-modal-transformer.assets/image (5).png)
> Key frames and motion vectors used in *Jin et al., “Video-LaVIT.”*
# Fuse with Diffusion Models
Despite continuous efforts to enable representation and generation of images and videos with a language-model (auto-regressive) structure, current research indicates that diffusion models (more broadly, score-matching generative models) outperform language models on continuous feature generation. Score-matching generative models have their own separate and substantial community, with strong theoretical foundations and numerous variations emerging each year, such as stochastic differential equations, Bayesian flow, and rectified flow. In short, score-matching generative models are clearly here to stay alongside language models.
An intriguing question arises: why not integrate the structures of language models and diffusion models into one Transformer to reach the best of both worlds? *Zhou et al. in "Transfusion"* explored this idea. The approach is straightforward: build a Transformer that can handle both language and image inputs and outputs. The language component functions as a language model, while the image component serves as a denoiser network for diffusion models. The model is trained by combining the language modeling loss and DDPM loss, enabling it to function either as a language model or a text-to-image denoiser.
![image (6)](multi-modal-transformer.assets/image (6).png)
> A Transformer capable of functioning as a language model and a diffusion denoiser at the same time. Source: *Zhou, Chunting, Lili Yu, et al. “Transfusion: Predict the Next Token and Diffuse Images with One Multi-Modal Model,” ICLR, 2025.*
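At training time, the combined objective boils down to a weighted sum of the two losses over the respective positions in the sequence. The sketch below assumes a hypothetical model interface (per-position text logits and predicted noise, plus masks saying which positions are text and which are image latents); it is not the actual Transfusion code.

```python
import torch.nn.functional as F

def combined_lm_diffusion_loss(outputs, batch, image_weight=1.0):
    """Language-modeling loss on text positions plus a DDPM-style noise-prediction loss
    on image positions, following the Transfusion-style recipe described above."""
    lm_loss = F.cross_entropy(
        outputs.text_logits[batch.text_mask],        # logits at text positions
        batch.next_tokens[batch.text_mask],          # shifted next-token targets
    )
    diffusion_loss = F.mse_loss(
        outputs.predicted_noise[batch.image_mask],   # predicted noise at image positions
        batch.true_noise[batch.image_mask],          # noise actually added in the forward process
    )
    return lm_loss + image_weight * diffusion_loss
```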
# Conclusion
In conclusion, the evolution of Transformers into versatile foundation models capable of handling multiple modalities and functionalities represents a significant advancement in AI research. By enabling a single architecture to process diverse data types through techniques like vector quantization and lookup-free quantization, researchers have created models that can seamlessly integrate language, images, and other modalities within the same embedding space.
In our research domain, we encounter even more diverse and domain-specific multi-modal data, such as traffic flows, trajectories, and real-world agent interactions. A unified Transformer for such data presents a promising solution for creating "foundation models" that generalize across diverse tasks and scenarios. However, domain-specific challenges, including data encoding and decoding, computational efficiency, and scalability, must be addressed to realize this potential.

View file

@ -1,137 +0,0 @@
# One Step Diffusion Models
Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.
---
# Background
Diffusion models (DMs), or more broadly speaking, score-matching generative models, have become the de facto framework for building deep generation models. They demonstrate exceptional generation performance, especially on continuous modalities including images, videos, audio, and spatiotemporal data.
Most diffusion models work by coupling a forward diffusion process and a reverse denoising diffusion process. The forward diffusion process gradually adds noise to the ground truth clean data $X_0$, until noisy data $X_T$ that follows a relatively simple distribution is reached. The reverse denoising diffusion process starts from the noisy data $X_T$ and removes the noise component step-by-step until clean generated data $X_0$ is reached. The reverse process is a sequential simulation in which each step depends on the previous one, so it cannot be parallelized within a single generation, which becomes inefficient when the number of steps is large.
![image-20250503125941212](one-step-diffusion-models.assets/image-20250503125941212.png)
> The two processes in a typical diffusion model. *Source: Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”*
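For reference, the forward (noising) process of a DDPM-style model reduces to a single closed-form sampling step once the noise schedule is fixed; the linear schedule below is an assumption for illustration.

```python
import torch

T = 1000
betas = torch.linspace(1e-4, 0.02, T)            # assumed linear noise schedule
alpha_bars = torch.cumprod(1.0 - betas, dim=0)   # cumulative product of (1 - beta_t)

def forward_diffuse(x0, t):
    """Sample x_t ~ q(x_t | x_0) = N(sqrt(abar_t) * x_0, (1 - abar_t) * I)."""
    noise = torch.randn_like(x0)
    abar = alpha_bars[t].view(-1, *([1] * (x0.dim() - 1)))   # broadcast over feature dims
    return abar.sqrt() * x0 + (1.0 - abar).sqrt() * noise, noise
```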
## Understanding DMs
There are many ways to understand how Diffusion Models (DMs) work. One of the most common and intuitive approaches is that a DM learns an ordinary differential equation (ODE) that transforms noise into data. Imagine an ODE vector field between the noise $X_T$ and clean data $X_0$. By training on sufficiently large numbers of timesteps $t\in [0,T]$, a DM is able to learn the vector (tangent) towards the cleaner data $X_{t-\Delta t}$, given any specific timestep $t$ and the corresponding noisy data $X_t$. This idea is easy to illustrate in a simplified 1-dimensional data scenario.
![image-20250503132738122](one-step-diffusion-models.assets/image-20250503132738122.png)
> Illustrated ODE flow of a diffusion model on 1-dimensional data. *Source: Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”* It should be noted that as the figure suggests, there are differences between ODEs and DMs in a narrow sense. Flow matching models, a variant of DMs, more closely resemble ODEs.
## DMs Scale Poorly with Few Steps
Vanilla DDPM, which is essentially a discrete-timestep DM, can only perform the reverse process using the same number of steps it is trained on, typically thousands. DDIM introduces a reparameterization scheme that enables skipping steps during the reverse process of DDPM. Continuous-timestep DMs like Stochastic Differential Equations (SDE) naturally possess the capability of using fewer steps in the reverse process compared to the forward process/training.
> Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”
> Song, Meng, and Ermon, “Denoising Diffusion Implicit Models.”
> Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”
Nevertheless, it is observed that their performance typically suffers catastrophic degradation when the number of reverse process steps is reduced to single digits.
![image-20250503135351246](one-step-diffusion-models.assets/image-20250503135351246.png)
> Images generated by conventional DMs with only a few steps of reverse process. *Source: Frans et al., “One Step Diffusion via Shortcut Models.”*
To understand why DMs scale poorly with few reverse process steps, we can return to the ODE vector field perspective of DMs. When the target data distribution is complex, the vector field typically contains numerous intersections. When a given $X_t$ and $t$ lies at one of these intersections, the learned vector points in the averaged direction of all candidate trajectories. This causes the generated data to approach the mean of the training data when only a few reverse process steps are used. Another explanation is that the learned vector field is highly curved, and using only a few reverse process steps means attempting to approximate these curves with polylines, which is inherently difficult.
![image-20250503141422791](one-step-diffusion-models.assets/image-20250503141422791.png)
> Illustration of why DMs scale poorly with few reverse process steps. *Source: Frans et al., “One Step Diffusion via Shortcut Models.”*
We will introduce two branches of methods that aim to scale DMs to few or even single reverse process steps: **distillation-based** methods, which distill a pre-trained DM into a one-step model; and **end-to-end** methods, which train a one-step DM from scratch.
# Distillation
A representative family of distillation-based methods is **rectified flow**. Its idea follows the above insight about the "curved ODE vector field": if the curved vectors (flows) are hindering the scaling of reverse process steps, can we straighten these vectors so that they are easy to approximate with polylines or even straight lines?
*Liu, Gong, and Liu, "Flow Straight and Fast"* implements this idea, focusing on learning an ODE that follows straight vectors as much as possible. In the context of continuous-time DMs where $T=1$ and $t\in[0,1]$, suppose the clean data $X_0$ and noise $X_1$ each follow a distribution, $X_0\sim \pi_0$ and $X_1\sim \pi_1$. The "straight vectors" can be obtained by solving a nonlinear least squares optimization problem:
$$
\min_{v} \int_{0}^{1} \mathbb{E}\left[\left\|\left(X_{1}-X_{0}\right)-v\left(X_{t}, t\right)\right\|^{2}\right] \mathrm{d} t,
$$
$$
\quad X_{t}=t X_{1}+(1-t) X_{0}
$$
where $v$ is the velocity field of the ODE $\mathrm{d}Z_t = v(Z_t,t)\,\mathrm{d}t$.
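A single stochastic training step for this objective can be sketched as follows; the velocity network `v_net` and the way batches of $X_0$ and $X_1$ are obtained are left as assumptions.

```python
import torch

def rectified_flow_loss(v_net, x0, x1):
    """One Monte Carlo estimate of the straight-flow objective above.
    x0: clean data batch, x1: noise batch, v_net(x_t, t): predicted velocity field."""
    t = torch.rand(x0.shape[0], device=x0.device)            # t ~ Uniform[0, 1]
    t_b = t.view(-1, *([1] * (x0.dim() - 1)))                # broadcast over feature dims
    x_t = t_b * x1 + (1.0 - t_b) * x0                        # X_t = t X_1 + (1 - t) X_0
    return ((v_net(x_t, t) - (x1 - x0)) ** 2).mean()         # match the straight velocity X_1 - X_0
```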
Though straightforward, when the clean data distribution $\pi_0$ is very complicated, the ideal result of completely straight vectors is hard to achieve in a single pass. To address this, a "reflow" procedure is introduced, which iteratively trains new rectified flows using data generated by previously obtained flows:
$$
Z^{(k+1)} = \text{RectFlow}\big(Z_0^{(k)}, Z_1^{(k)}\big)
$$
This procedure produces increasingly straight flows that can be simulated with very few steps, ideally one step after several iterations.
![image-20250504142749208](one-step-diffusion-models.assets/image-20250504142749208.png)
> Illustrations of vector fields after different numbers of reflow iterations. *Source: Liu, Gong, and Liu, “Flow Straight and Fast.”*
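The reflow step amounts to re-pairing noise samples with the data the current flow generates from them, then retraining on these new pairs; the Euler integration below is just one possible ODE solver, chosen for illustration.

```python
import torch

@torch.no_grad()
def generate_reflow_pairs(v_net, noise, n_steps=100):
    """Simulate the learned ODE from Z_1 (noise) to Z_0, yielding (Z_0^(k), Z_1^(k)) pairs
    on which the next rectified flow is trained."""
    z, dt = noise.clone(), 1.0 / n_steps
    for k in range(n_steps, 0, -1):                          # integrate t from 1 down to 0
        t = torch.full((z.shape[0],), k / n_steps, device=z.device)
        z = z - v_net(z, t) * dt                             # Euler step along dZ_t = v(Z_t, t) dt
    return z, noise
```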
In practice, distillation-based methods are usually trained in two stages: first train a normal DM, and later distill one-step capabilities into it. This introduces additional computational overhead and complexity.
# End-to-end
Compared to distillation-based methods, end-to-end-based methods train a one-step-capable diffusion model (DM) within a single training run. Various techniques are used to implement such methods. We will focus on two of them: **consistency models** and **shortcut models**.
## Consistency Models
In discrete-timestep diffusion models (DMs), three components in the reverse denoising diffusion process are interchangeable through reparameterization: the noise component $\epsilon_t$ to remove, the less noisy previous step $x_{t-1}$, and the predicted clean sample $x_0$. This interchangeability is enabled by the following equation:
$$
x_t = \sqrt{\bar{\alpha}_t} \, x_0 + \sqrt{1 - \bar{\alpha}_t} \, \epsilon_t
$$
In theory, without altering the fundamental formulation of DMs, the learnable denoiser network can be designed to predict any of these three components. Consistency models (CMs) follow this principle by training the denoiser to specifically predict the clean sample $x_0$. The benefit of this approach is that CMs can naturally scale to perform the reverse process with few steps or even a single step.
![image-20250504161430743](one-step-diffusion-models.assets/image-20250504161430743.png)
> A consistency model that learns to map any point on the ODE trajectory to the clean sample. *Source: Song et al., “Consistency Models.”*
Formally, CMs learn a function $f_\theta(x_t,t)$ that maps noisy data $x_t$ at time $t$ directly to the clean data $x_0$, satisfying:
$$
f_\theta(x_t, t) = f_\theta(x_{t'}, t') \quad \forall t, t'
$$
The model must also obey the differential consistency condition:
$$
\frac{d}{dt} f_\theta(x_t, t) = 0
$$
CMs are trained by minimizing the discrepancy between outputs at adjacent times, with the loss function:
$$
\mathcal{L} = \mathbb{E} \left[ d\left(f_\theta(x_t, t), f_\theta(x_{t'}, t')\right) \right]
$$
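In the discrete-time case, a training step might look like the sketch below, where the network at a noisier point is pulled toward a stop-gradient (EMA) copy evaluated at the adjacent, less noisy point on the same trajectory; the noise levels, the EMA teacher, and the squared-error distance are assumptions that vary across papers.

```python
import torch
import torch.nn.functional as F

def consistency_training_step(f_theta, f_ema, x0, sigmas):
    """Pull f_theta(x_t, t) toward f_ema(x_{t'}, t') for adjacent noise levels on one trajectory.
    f_theta: online network, f_ema: EMA teacher (no gradient), sigmas: increasing noise levels."""
    n = torch.randint(1, len(sigmas), (x0.shape[0],))        # pick adjacent pairs (n-1, n)
    noise = torch.randn_like(x0)
    shape = (-1, *([1] * (x0.dim() - 1)))
    x_noisier = x0 + sigmas[n].view(shape) * noise           # point further along the trajectory
    x_cleaner = x0 + sigmas[n - 1].view(shape) * noise       # adjacent, less noisy point
    with torch.no_grad():
        target = f_ema(x_cleaner, sigmas[n - 1])
    return F.mse_loss(f_theta(x_noisier, sigmas[n]), target)
```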
Similar to continuous-timestep DMs and discrete-timestep DMs, CMs also have continuous-time and discrete-time variants. Discrete-time CMs are easier to train, but are more sensitive to timestep scheduling and suffer from discretization errors. Continuous-time CMs, on the other hand, suffer from instability during training.
For a deeper discussion of the differences between the two variants of CMs, and how to stabilize continuous-time CMs, please refer to *Lu and Song, "Simplifying, Stabilizing and Scaling Continuous-Time Consistency Models."*
## Shortcut Models
Similar to distillation-based methods, the core idea of shortcut models is inspired by the "curved vector field" problem, but shortcut models take a different approach to solving it.
Shortcut models are introduced in *Frans et al., "One Step Diffusion via Shortcut Models."* The paper presents the insight that conventional DMs' poor performance when jumping with large step sizes stems from their lack of awareness of the step size they are asked to jump forward. Since they are only trained with small step sizes, they learn only the tangents of the curved vector field, not the "correct direction" to take when a large step size is used.
Based on this insight, on top of $x_t$ and $t$, shortcut models additionally include step size $d$ as part of the condition for the denoiser network. At small step sizes ($d\rightarrow 0$), the model behaves like a standard flow-matching model, learning the expected tangent from noise to data. For larger step sizes, the model learns that one large step should equal two consecutive smaller steps (self-consistency), creating a binary recursive formulation. The model is trained by combining the standard flow matching loss when $d=0$ and the self-consistency loss when $d>0$:
$$
\mathcal{L} = \mathbb{E} \left[ \underbrace{\| s_\theta(x_t, t, 0) - (x_1 - x_0)\|^2}_{\text{Flow-Matching}} + \underbrace{\|s_\theta(x_t, t, 2d) - \mathbf{s}_{\text{target}}\|^2}_{\text{Self-Consistency}} \right],
$$
$$
\mathbf{s}_{\text{target}} = \tfrac{1}{2}\, s_\theta(x_t, t, d) + \tfrac{1}{2}\, s_\theta(x'_{t+d}, t + d, d), \quad x'_{t+d} = x_t + s_\theta(x_t, t, d)\, d
$$
![image-20250504180714955](one-step-diffusion-models.assets/image-20250504180714955.png)
> Illustration of the training process of shortcut models. *Source: Frans et al., “One Step Diffusion via Shortcut Models.”*
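Putting the two terms together, one training step could be sketched as below, following the article's notation ($x_0$, $x_1$ are the endpoints of the linear interpolation); the denoiser signature `s_theta(x, t, d)`, the fixed small step size, and the Euler update are assumptions for illustration.

```python
import torch

def shortcut_training_step(s_theta, x0, x1, d_small=1.0 / 128):
    """Flow-matching loss at step size 0 plus a self-consistency loss tying one 2d-step
    to two consecutive d-steps, as in the combined objective above."""
    B = x0.shape[0]
    t = torch.rand(B, device=x0.device) * (1.0 - 2 * d_small)   # keep t + 2d inside [0, 1]
    t_b = t.view(-1, *([1] * (x0.dim() - 1)))
    x_t = t_b * x1 + (1.0 - t_b) * x0

    # Flow-matching term: at d = 0, predict the straight velocity x1 - x0.
    zero = torch.zeros(B, device=x0.device)
    fm_loss = ((s_theta(x_t, t, zero) - (x1 - x0)) ** 2).mean()

    # Self-consistency term: one jump of size 2d should match two jumps of size d.
    d = torch.full((B,), d_small, device=x0.device)
    with torch.no_grad():
        s1 = s_theta(x_t, t, d)
        x_next = x_t + s1 * d.view_as(t_b)                      # x'_{t+d} = x_t + s_theta(x_t, t, d) d
        s_target = (s1 + s_theta(x_next, t + d, d)) / 2
    sc_loss = ((s_theta(x_t, t, 2 * d) - s_target) ** 2).mean()
    return fm_loss + sc_loss
```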
Both consistency models and shortcut models can be seamlessly scaled between one-step and multi-step generation to balance quality and efficiency.

View file

@ -1,131 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Yan Lin's Blog - {{ title }}</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<script>
MathJax = {
tex: {
inlineMath: [['$', '$'], ['\\(', '\\)']],
displayMath: [['$$', '$$'], ['\\[', '\\]']]
},
options: {
skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code'],
processHtmlClass: 'arithmatex'
}
};
window.addEventListener('load', function() {
document.querySelectorAll('script[type^="math/tex"]').forEach(function(script) {
const isDisplay = script.type.includes('mode=display');
const math = script.textContent;
const span = document.createElement('span');
span.className = isDisplay ? 'mathjax-block' : 'mathjax-inline';
span.innerHTML = isDisplay ? `\\[${math}\\]` : `\\(${math}\\)`;
script.parentNode.replaceChild(span, script);
});
if (typeof MathJax !== 'undefined' && MathJax.typesetPromise) {
MathJax.typesetPromise();
}
});
</script>
<style>
a {
font-family: 'Lato', sans-serif;
}
img, .figure {
max-width: min(100%, 800px);
height: auto;
display: block;
margin-left: auto;
margin-right: auto;
}
.blog-title {
font-size: calc(1.35rem + 0.9vw);
font-weight: bold;
}
h1 {
font-size: calc(1.35rem + 0.6vw);
margin-top: 2rem;
}
h2 {
font-size: calc(1.1rem + 0.4vw);
margin-top: 1.5rem;
}
h3 {
font-size: calc(0.95rem + 0.1vw);
font-weight: bold;
margin-top: 1rem;
}
</style>
</head>
<body>
<div class="container">
<header class="border-bottom lh-1 py-3 border-secondary">
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
<a class="link-secondary header-icon px-2 h4" href="/"><i class="bi bi-house-fill"></i></a>
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">Yan Lin's Blog</div>
</div>
<div class="col-2 text-end">
<a class="link-secondary header-icon px-2 h4" href="/blog"><i class="bi bi-list-task"></i></a>
</div>
</div>
</header>
</div>
<main class="container">
<article class="section col-xl-10 col-xxl-9 mx-auto">
{{ content }}
</article>
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">Copyright © 2025. Designed and implemented by Yan Lin.</p>
</main>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
<script>
    document.addEventListener('DOMContentLoaded', function() {
        document.querySelectorAll('img').forEach(function(img) {
            img.classList.add('figure-img', 'rounded');
        });
    });
    // Show or hide the back-to-top button
    window.addEventListener('scroll', function() {
        var backToTopButton = document.getElementById('back-to-top');
        if (window.scrollY > 100) {
            backToTopButton.style.display = 'block';
        } else {
            backToTopButton.style.display = 'none';
        }
    });
    // Smoothly scroll to the top when the button is clicked
    document.getElementById('back-to-top').addEventListener('click', function() {
        window.scrollTo({
            top: 0,
            behavior: 'smooth'
        });
    });
</script>
</body>
</html>

406
dist/index.html vendored
View file

@ -36,60 +36,95 @@
</div>
</div>
<nav class="navbar navbar-expand">
<ul class="navbar-nav d-flex justify-content-evenly mx-auto gap-5">
<li class="nav-item">
<a class="link nav-link px-0" href="/#publications"><i class="bi bi-book d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Publications</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#projects"><i class="bi bi-code-slash d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Projects</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#presentations"><i class="bi bi-easel d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Presentations</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#services"><i class="bi bi-person-lines-fill d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Services</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#blog"><i class="bi bi-newspaper d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Blog</span></a>
</li>
</ul>
<nav class="navbar navbar-expand">
<ul class="navbar-nav d-flex justify-content-evenly mx-auto gap-5">
<li class="nav-item">
<a class="link nav-link px-0" href="/#publications"
><i class="bi bi-book d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Publications</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#projects"
><i class="bi bi-code-slash d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Projects</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#presentations"
><i class="bi bi-easel d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Presentations</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#services"
><i
class="bi bi-person-lines-fill d-inline d-md-none d-lg-inline"
></i>
<span class="d-none d-md-inline">Services</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="https://blog.yanlincs.com"
><i class="bi bi-newspaper d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Blog</span></a
>
</li>
</ul>
</nav>
</header>
<div class="row g-0 border rounded text-body-emphasis bg-body-secondary flex-md-row my-4 position-relative shadow-sm transition-shadow" style="transition: box-shadow 0.2s ease-in-out;" onmouseover="this.classList.remove('shadow-sm'); this.classList.add('shadow')" onmouseout="this.classList.remove('shadow'); this.classList.add('shadow-sm')">
<div
class="row g-0 border rounded text-body-emphasis bg-body-secondary flex-md-row my-4 position-relative shadow-sm transition-shadow"
style="transition: box-shadow 0.2s ease-in-out"
onmouseover="this.classList.remove('shadow-sm'); this.classList.add('shadow')"
onmouseout="this.classList.remove('shadow'); this.classList.add('shadow-sm')"
>
<div class="col p-4 d-flex flex-column position-static">
<h2 class="fst-italic mb-3">Biography - Yan Lin</h2>
<p class="card-text mb-auto" style="font-size: 1.1rem;">
I am currently a postdoctoral researcher in the Department of Computer Science at Aalborg University.
I received my PhD and Bachelor's degrees from Beijing Jiaotong University, China.
My research interests include <i>spatiotemporal data mining</i>, <i>representation learning</i>, and <i>AI for science</i>.
</p>
<h2 class="fst-italic mb-3">Biography - Yan Lin</h2>
<p class="card-text mb-auto" style="font-size: 1.1rem">
I am currently a postdoctoral researcher in the Department of
Computer Science at Aalborg University. I received my PhD and
Bachelor's degrees from Beijing Jiaotong University, China. My
research interests include <i>spatiotemporal data mining</i>,
<i>representation learning</i>, and <i>AI for science</i>.
</p>
</div>
<div class="col-5 col-xl-4 col-xxl-3 d-none d-lg-block d-flex align-items-center">
<img src="/profile.webp" alt="Yan Lin" class="rounded w-100" style="object-fit: contain;">
<div
class="col-5 col-xl-4 col-xxl-3 d-none d-lg-block d-flex align-items-center"
>
<img
src="/profile.webp"
alt="Yan Lin"
class="rounded w-100"
style="object-fit: contain"
/>
</div>
</div>
</div>
<article class="section" id="publications">
<article class="section" id="publications">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-book"></i> Publications</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/publications/">View All <i class="bi bi-arrow-right-circle"></i></a>
<h2 class="section-title d-inline-block mb-0">
<i class="bi bi-book"></i> Publications
</h2>
<a
class="mb-0 link link-secondary link-underline-opacity-0 h5"
href="/publications/"
>View All <i class="bi bi-arrow-right-circle"></i
></a>
</div>
<div>
<div id="primary-publications" class="list-group list-group-flush">
<div class="list-group-item px-0">
<div id="primary-publications" class="list-group list-group-flush">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IEEE TKDE<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ieeexplore.ieee.org/document/11004614" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2402.07232" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/UVTM" target="_blank" rel="noopener noreferrer">Code</a>
@ -98,11 +133,7 @@
</div>
<h5 class="mb-1 paper-title">UVTM: Universal Vehicle Trajectory Modeling with ST Feature Domain Generation</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Jilin Hu, Shengnan Guo, Bin Yang, Christian S. Jensen, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IJCAI<span class='text-muted'> | </span>2025
@ -117,11 +148,7 @@
</div>
<h5 class="mb-1 paper-title">TrajCogn: Leveraging LLMs for Cognizing Movement Patterns and Travel Purposes from Trajectories</h5>
<p class="card-text mb-auto author-name">Zeyu Zhou*, <strong>Yan Lin*</strong>, Haomin Wen, Shengnan Guo, Jilin Hu, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IEEE TKDE<span class='text-muted'> | </span>2025
@ -138,11 +165,7 @@
</div>
<h5 class="mb-1 paper-title">UniTE: A Survey and Unified Pipeline for Pre-training Spatiotemporal Trajectory Embeddings</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Zeyu Zhou, Yicheng Liu, Haochen Lv, Haomin Wen, Tianyi Li, Yushuai Li, Christian S. Jensen, Shengnan Guo, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
WWW<span class='text-muted'> | </span>2025
@ -157,11 +180,7 @@
</div>
<h5 class="mb-1 paper-title">Path-LLM: A Multi-Modal Path Representation Learning by Aligning and Fusing with Large Language Models</h5>
<p class="card-text mb-auto author-name">Yongfu Wei*, <strong>Yan Lin*</strong>, Hongfan Gao, Ronghui Xu, Sean Bin Yang, Jilin Hu</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
AAAI<span class='text-muted'> | </span>2025
@ -174,11 +193,7 @@
</div>
<h5 class="mb-1 paper-title">DutyTTE: Deciphering Uncertainty in Origin-Destination Travel Time Estimation</h5>
<p class="card-text mb-auto author-name">Xiaowei Mao*, <strong>Yan Lin*</strong>, Shengnan Guo, Yubin Chen, Xingyu Xian, Haomin Wen, Qisen Xu, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
NeurIPS<span class='text-muted'> | </span>2024
@ -193,11 +208,7 @@
</div>
<h5 class="mb-1 paper-title">Mobility-LLM: Learning Visiting Intentions and Travel Preference from Human Mobility Data with Large Language Models</h5>
<p class="card-text mb-auto author-name">Letian Gong*, <strong>Yan Lin*</strong>, Xinyue Zhang, Yiwen Lu, Xuedi Han, Yichen Liu, Shengnan Guo, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
SIGMOD<span class='text-muted'> | </span>2024
@ -214,11 +225,7 @@
</div>
<h5 class="mb-1 paper-title">Origin-Destination Travel Time Oracle for Map-based Services</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Huaiyu Wan, Jilin Hu, Shengnan Guo, Bin Yang, Christian S. Jensen, Youfang Lin</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IEEE TKDE<span class='text-muted'> | </span>2023
@ -235,11 +242,7 @@
</div>
<h5 class="mb-1 paper-title">Pre-training General Trajectory Embeddings with Maximum Multi-view Entropy Coding</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Huaiyu Wan, Shengnan Guo, Jilin Hu, Christian S. Jensen, Youfang Lin</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IEEE TKDE<span class='text-muted'> | </span>2022
@ -254,11 +257,7 @@
</div>
<h5 class="mb-1 paper-title">Pre-training Time-aware location embeddings from spatial-temporal trajectories</h5>
<p class="card-text mb-auto author-name">Huaiyu Wan, <strong>Yan Lin</strong>, Shengnan Guo, Youfang Lin</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
AAAI<span class='text-muted'> | </span>2021
@ -273,15 +272,11 @@
</div>
<h5 class="mb-1 paper-title">Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Huaiyu Wan, Shengnan Guo, Youfang Lin</p>
</div>
</div>
<hr class="my-2">
<div id="secondary-publications" class="list-group list-group-flush">
<div class="list-group-item px-0">
</div>
</div>
<hr class="my-2" />
<div id="secondary-publications" class="list-group list-group-flush">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
KDD<span class='text-muted'> | </span>2025
@ -296,11 +291,7 @@
</div>
<h5 class="mb-1 paper-title">DUET: Dual Clustering Enhanced Multivariate Time Series Forecasting</h5>
<p class="card-text mb-auto author-name">Xiangfei Qiu, Xingjian Wu, <strong>Yan Lin</strong>, Chenjuan Guo, Jilin Hu, Bin Yang</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
IEEE TKDE<span class='text-muted'> | </span>2024
@ -317,11 +308,7 @@
</div>
<h5 class="mb-1 paper-title">Diff-RNTraj: A Structure-aware Diffusion Model for Road Network-constrained Trajectory Generation</h5>
<p class="card-text mb-auto author-name">Tonglong Wei, Youfang Lin, Shengnan Guo, <strong>Yan Lin</strong>, Yiheng Huang, Chenyang Xiang, Yuqing Bai, Menglu Ya, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
IEEE TKDE<span class='text-muted'> | </span>2024
@ -334,11 +321,7 @@
</div>
<h5 class="mb-1 paper-title">STCDM: Spatio-Temporal Contrastive Diffusion Model for Check-In Sequence Generation</h5>
<p class="card-text mb-auto author-name">Letian Gong, Shengnan Guo, <strong>Yan Lin</strong>, Yichen Liu, Erwen Zheng, Yiwei Shuang, Youfang Lin, Jilin Hu, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
IEEE TKDE<span class='text-muted'> | </span>2024
@ -355,11 +338,7 @@
</div>
<h5 class="mb-1 paper-title">Micro-Macro Spatial-Temporal Graph-based Encoder-Decoder for Map-Constrained Trajectory Recovery</h5>
<p class="card-text mb-auto author-name">Tonglong Wei, Youfang Lin, <strong>Yan Lin</strong>, Shengnan Guo, Lan Zhang, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
KBS<span class='text-muted'> | </span>2024
@ -374,11 +353,7 @@
</div>
<h5 class="mb-1 paper-title">Inductive and Adaptive Graph Convolution Networks Equipped with Constraint Task for Spatial-Temporal Traffic Data Kriging</h5>
<p class="card-text mb-auto author-name">Tonglong Wei, Youfang Lin, Shengnan Guo, <strong>Yan Lin</strong>, Yiji Zhao, Xiyuan Jin, Zhihao Wu, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
IEEE TKDE<span class='text-muted'> | </span>2024
@ -391,11 +366,7 @@
</div>
<h5 class="mb-1 paper-title">Spatial-Temporal Cross-View Contrastive Pre-Training for Check-in Sequence Representation Learning</h5>
<p class="card-text mb-auto author-name">Letian Gong, Huaiyu Wan, Shengnan Guo, Li Xiucheng, <strong>Yan Lin</strong>, Erwen Zheng, Tianyi Wang, Zeyu Zhou, Youfang Lin</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
AAAI<span class='text-muted'> | </span>2023
@ -410,11 +381,7 @@
</div>
<h5 class="mb-1 paper-title">Contrastive Pre-training with Adversarial Perturbations for Check-In Sequence Representation Learning</h5>
<p class="card-text mb-auto author-name">Letian Gong, Youfang Lin, Shengnan Guo, <strong>Yan Lin</strong>, Tianyi Wang, Erwen Zheng, Zeyu Zhou, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
ESWA<span class='text-muted'> | </span>2023
@ -427,11 +394,7 @@
</div>
<h5 class="mb-1 paper-title">Adversarial Self-Attentive Time-Variant Neural Networks for Multi-Step Time Series Forecasting</h5>
<p class="card-text mb-auto author-name">Changxia Gao, Ning Zhang, Youru Li, <strong>Yan Lin</strong>, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
APIN<span class='text-muted'> | </span>2023
@ -444,11 +407,7 @@
</div>
<h5 class="mb-1 paper-title">Multi-scale Adaptive Attention-based Time-Variant Neural Networks for Multi-step Time Series Forecasting</h5>
<p class="card-text mb-auto author-name">Changxia Gao, Ning Zhang, Youru Li, <strong>Yan Lin</strong>, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
NeurIPS<span class='text-muted'> | </span>2023
@ -463,26 +422,30 @@
</div>
<h5 class="mb-1 paper-title">WITRAN: Water-wave Information Transmission and Recurrent Acceleration Network for Long-range Time Series Forecasting</h5>
<p class="card-text mb-auto author-name">Yuxin Jia, Youfang Lin, Xinyan Hao, <strong>Yan Lin</strong>, Shengnan Guo, Huaiyu Wan</p>
</div>
</div>
</div>
</div>
</div>
<div class="text-start mt-1">
<small class="text-muted" style="font-size: 0.8rem;">* Equal Contribution</small>
<small class="text-muted" style="font-size: 0.8rem"
>* Equal Contribution</small
>
</div>
</article>
</article>
<article class="section" id="projects">
<article class="section" id="projects">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-code-slash"></i> Projects</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/projects/">View All <i class="bi bi-arrow-right-circle"></i></a>
<h2 class="section-title d-inline-block mb-0">
<i class="bi bi-code-slash"></i> Projects
</h2>
<a
class="mb-0 link link-secondary link-underline-opacity-0 h5"
href="/projects/"
>View All <i class="bi bi-arrow-right-circle"></i
></a>
</div>
<div>
<div id="primary-projects" class="list-group list-group-flush">
<div class="list-group-item px-0">
<div id="primary-projects" class="list-group list-group-flush">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Fundamental Research Funds for the Central Universities of China
@ -493,11 +456,7 @@
</div>
<h5 class="mb-1 paper-title">Research on <i>Prediction of User Travel Destination and Travel Time Based on Trajectory Representation Learning</i></h5>
<p class="card-text mb-auto project-desc">Applies representation learning to trajectory data to transform original features into high-level information, improving the performance of downstream tasks such as travel time and destination prediction.</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Personal Interest Project
@ -512,11 +471,7 @@
</div>
<h5 class="mb-1 paper-title">Development of <i>OverleafCopilot - Empowering Academic Writing in Overleaf with Large Language Models</i></h5>
<p class="card-text mb-auto project-desc">This project aims to develop a Browser extension to seamlessly integrate Large Language Models (such as ChatGPT) into the popular online academic writing platform, Overleaf.</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Personal Interest Project
@ -531,15 +486,11 @@
</div>
<h5 class="mb-1 paper-title">Development of <i>PromptGenius - All-purpose prompts for LLMs</i></h5>
<p class="card-text mb-auto project-desc">This project focuses on developing a website that offers a wide range of prompt categories, enhancing the versatility of LLMs for various tasks and improving their output quality.</p>
</div>
</div>
<hr class="my-2">
<div id="secondary-projects" class="list-group list-group-flush">
<div class="list-group-item px-0">
</div>
</div>
<hr class="my-2" />
<div id="secondary-projects" class="list-group list-group-flush">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
Villum Foundation
@ -550,11 +501,7 @@
</div>
<h5 class="mb-1 paper-title">Research on <i>Inverse Design of Materials Using Diffusion Probabilistic Models</i></h5>
<p class="card-text mb-auto project-desc">This project focuses on developing diffusion probabilistic models to first understand the relationship between chemistry/structure and material properties, then enable the inverse design of new materials with specific properties. This project currently supports my postdoctoral research position.</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
National Natural Science Foundation of China
@ -565,11 +512,7 @@
</div>
<h5 class="mb-1 paper-title">Research on <i>Pre-training Representation Learning Methods of Spatial-temporal Trajectory Data for Traffic Prediction</i></h5>
<p class="card-text mb-auto project-desc">This project aims to propose pre-training representation learning methods for spatial-temporal trajectory data, modeling multiple features to improve traffic prediction tasks. It demonstrates how trajectory representation learning can enhance traffic data mining.</p>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
National Natural Science Foundation of China
@ -580,21 +523,24 @@
</div>
<h5 class="mb-1 paper-title">Research on <i>Spatial-temporal Trajectory Generation and Representation Learning Methods for Sparsity Problems</i></h5>
<p class="card-text mb-auto project-desc">This project explores how to generate high-quality spatial-temporal trajectory data and corresponding representations to address sparsity-related issues, thereby supporting a variety of downstream tasks.</p>
</div>
</div>
</div>
</div>
</div>
</article>
</article>
<article class="section" id="presentations">
<article class="section" id="presentations">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-easel"></i> Presentations</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/presentations/">View All <i class="bi bi-arrow-right-circle"></i></a>
<h2 class="section-title d-inline-block mb-0">
<i class="bi bi-easel"></i> Presentations
</h2>
<a
class="mb-0 link link-secondary link-underline-opacity-0 h5"
href="/presentations/"
>View All <i class="bi bi-arrow-right-circle"></i
></a>
</div>
<div class="list-group list-group-flush" id="presentation-list">
<div class="list-group-item px-0">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Guest lecture<span class='text-muted'> | </span>Aalborg University
@@ -606,9 +552,7 @@
</div>
</div>
<h5 class="mb-1 paper-title">Self-supervised Learning of Trajectory Data</h5>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Workshop presentation<span class='text-muted'> | </span>KDD 2024
@@ -622,9 +566,7 @@
</div>
</div>
<h5 class="mb-1 paper-title">PLM4Traj: Leveraging Pre-trained Language Models for Cognizing Movement Patterns and Travel Purposes from Trajectories</h5>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Paper Oral<span class='text-muted'> | </span>SIGMOD 2024
@@ -636,9 +578,7 @@
</div>
</div>
<h5 class="mb-1 paper-title">Origin-Destination Travel Time Oracle for Map-based Services</h5>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Tutorial<span class='text-muted'> | </span>SpatialDI 2024
@@ -650,9 +590,7 @@
</div>
</div>
<h5 class="mb-1 paper-title">Self-supervised Learning of Spatial-temporal Trajectories</h5>
</div>
<div class="list-group-item px-0">
</div> <div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Paper Oral<span class='text-muted'> | </span>AAAI 2021
@@ -664,51 +602,33 @@
</div>
</div>
<h5 class="mb-1 paper-title">Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction</h5>
</div>
</div>
</div>
</article>
</article>
<article id="services" class="rounded text-body-emphasis bg-body-secondary flex-md-row my-4 position-relative p-4 transition-shadow" style="transition: box-shadow 0.2s ease-in-out;" onmouseover="this.classList.add('shadow-sm')" onmouseout="this.classList.remove('shadow-sm')">
<article
id="services"
class="rounded text-body-emphasis bg-body-secondary flex-md-row my-4 position-relative p-4 transition-shadow"
style="transition: box-shadow 0.2s ease-in-out"
onmouseover="this.classList.add('shadow-sm')"
onmouseout="this.classList.remove('shadow-sm')"
>
<h2 class="mb-3"><i class="bi bi-person-lines-fill"></i> Services</h2>
<div id="service-list">
<ul class="list ps-3">
<li>IEEE, ACM member</li>
<li>Secretary of IEEE (Denmark Section) Computer Society</li>
<li>Reviewer for journals including TIST, TII, and TVT</li>
<li>Member of program committees of KDD, ICLR, NeurIPS, AAAI, CVPR, ICCV, IJCAI, and WWW</li>
</ul>
<ul class="list ps-3">
<li>IEEE, ACM member</li>
<li>Secretary of IEEE (Denmark Section) Computer Society</li>
<li>Reviewer for journals including TIST, TII, and TVT</li>
<li>Member of program committees of KDD, ICLR, NeurIPS, AAAI, CVPR, ICCV, IJCAI, and WWW</li>
</ul>
</div>
</article>
</article>
<article class="section" id="blog">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-newspaper"></i> Blog</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/blog/">View All <i class="bi bi-arrow-right-circle"></i></a>
</div>
<div class="list-group list-group-flush" id="blog-list">
<div class="list-group-item px-0">
<a class="mb-1 paper-title blog-link text-decoration-none" href="/blog/html/one-step-diffusion-models.html">
One Step Diffusion Models <i class="bi bi-arrow-right-circle"></i>
</a> <span class="paper-title text-muted ms-2">May 2025</span>
<p class="card-text mb-auto tldr">Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.</p>
</div>
<div class="list-group-item px-0">
<a class="mb-1 paper-title blog-link text-decoration-none" href="/blog/html/multi-modal-transformer.html">
Multi-modal and Multi-function Transformers <i class="bi bi-arrow-right-circle"></i>
</a> <span class="paper-title text-muted ms-2">April 2025</span>
<p class="card-text mb-auto tldr">Multi-modal and multi-function Transformers enables a single architecture to process diverse data types such as language, images, and videos simultaneously. These models employ techniques like vector quantization and lookup-free quantization to map different modalities into a unified embedding space, allowing the Transformer to handle them within the same sequence. Beyond processing multiple data types, these architectures can also combine different functionalities-such as auto-regressive language generation and diffusion-based image creation-within a single model.</p>
</div>
</div>
</article>
</main>
@@ -720,15 +640,14 @@
<span class="mx-1">|</span>
Designed and implemented by Yan Lin.
<span class="mx-1">|</span>
<a class="link link-secondary" target="_blank" href="https://git.yanlincs.com/yanlin/Homepage">Source Code</a>
<a class="link link-secondary" target="_blank" href="https://github.com/Logan-Lin/Homepage">Source Code</a>
</p>
</div>
</footer>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
<script>
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
@@ -751,8 +670,7 @@
return false;
});
</script>
</body>
</html>

View file

@@ -127,7 +127,7 @@
<span class="mx-1">|</span>
Designed and implemented by Yan Lin.
<span class="mx-1">|</span>
<a class="link link-secondary" target="_blank" href="https://git.yanlincs.com/yanlin/Homepage">Source Code</a>
<a class="link link-secondary" target="_blank" href="https://github.com/Logan-Lin/Homepage">Source Code</a>
</p>
</div>
</footer>

View file

@@ -36,24 +36,41 @@
</div>
<!-- <nav class="navbar navbar-expand">
<ul class="navbar-nav d-flex justify-content-evenly mx-auto gap-5">
<li class="nav-item">
<a class="link nav-link px-0" href="/#publications"><i class="bi bi-book d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Publications</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#projects"><i class="bi bi-code-slash d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Projects</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#presentations"><i class="bi bi-easel d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Presentations</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#services"><i class="bi bi-person-lines-fill d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Services</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#blog"><i class="bi bi-newspaper d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Blog</span></a>
</li>
</ul>
</nav> -->
<ul class="navbar-nav d-flex justify-content-evenly mx-auto gap-5">
<li class="nav-item">
<a class="link nav-link px-0" href="/#publications"
><i class="bi bi-book d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Publications</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#projects"
><i class="bi bi-code-slash d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Projects</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#presentations"
><i class="bi bi-easel d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Presentations</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#services"
><i
class="bi bi-person-lines-fill d-inline d-md-none d-lg-inline"
></i>
<span class="d-none d-md-inline">Services</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="https://blog.yanlincs.com"
><i class="bi bi-newspaper d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Blog</span></a
>
</li>
</ul>
</nav> -->
</header>
@@ -178,7 +195,7 @@
<span class="mx-1">|</span>
Designed and implemented by Yan Lin.
<span class="mx-1">|</span>
<a class="link link-secondary" target="_blank" href="https://git.yanlincs.com/yanlin/Homepage">Source Code</a>
<a class="link link-secondary" target="_blank" href="https://github.com/Logan-Lin/Homepage">Source Code</a>
</p>
</div>
</footer>

View file

@@ -36,24 +36,41 @@
</div>
<!-- <nav class="navbar navbar-expand">
<ul class="navbar-nav d-flex justify-content-evenly mx-auto gap-5">
<li class="nav-item">
<a class="link nav-link px-0" href="/#publications"><i class="bi bi-book d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Publications</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#projects"><i class="bi bi-code-slash d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Projects</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#presentations"><i class="bi bi-easel d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Presentations</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#services"><i class="bi bi-person-lines-fill d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Services</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#blog"><i class="bi bi-newspaper d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Blog</span></a>
</li>
</ul>
</nav> -->
<ul class="navbar-nav d-flex justify-content-evenly mx-auto gap-5">
<li class="nav-item">
<a class="link nav-link px-0" href="/#publications"
><i class="bi bi-book d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Publications</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#projects"
><i class="bi bi-code-slash d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Projects</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#presentations"
><i class="bi bi-easel d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Presentations</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#services"
><i
class="bi bi-person-lines-fill d-inline d-md-none d-lg-inline"
></i>
<span class="d-none d-md-inline">Services</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="https://blog.yanlincs.com"
><i class="bi bi-newspaper d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Blog</span></a
>
</li>
</ul>
</nav> -->
</header>
@@ -70,6 +87,8 @@
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ieeexplore.ieee.org/document/11004614" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2402.07232" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/UVTM" target="_blank" rel="noopener noreferrer">Code</a>
@@ -464,7 +483,7 @@
<span class="mx-1">|</span>
Designed and implemented by Yan Lin.
<span class="mx-1">|</span>
<a class="link link-secondary" target="_blank" href="https://git.yanlincs.com/yanlin/Homepage">Source Code</a>
<a class="link link-secondary" target="_blank" href="https://github.com/Logan-Lin/Homepage">Source Code</a>
</p>
</div>
</footer>

View file

@@ -6,28 +6,25 @@ from jinja2 import Environment, FileSystemLoader

if __name__ == '__main__':
    with open('data.yaml', 'r') as file:
        profile_data = yaml.safe_load(file)

    env = Environment(loader=FileSystemLoader('templates'))

    os.makedirs('dist', exist_ok=True)
    os.makedirs('dist/publications', exist_ok=True)
    os.makedirs('dist/projects', exist_ok=True)
    os.makedirs('dist/presentations', exist_ok=True)
    os.makedirs('dist/blog', exist_ok=True)
    os.makedirs('dist/blog/html', exist_ok=True)

    def render_template(template_name, output_path, **kwargs):
        template = env.get_template(template_name)
        html = template.render(**kwargs)
        with open(output_path, 'w') as file:
            file.write(html)
        print(f'Generated {output_path}')

    render_template('index.html', 'dist/index.html', data=profile_data, is_home_page=True)
    render_template('publications.html', 'dist/publications/index.html', data=profile_data, is_home_page=False)
    render_template('projects.html', 'dist/projects/index.html', data=profile_data, is_home_page=False)
    render_template('presentations.html', 'dist/presentations/index.html', data=profile_data, is_home_page=False)
    render_template('blog.html', 'dist/blog/index.html', data=profile_data, is_home_page=False)

    print('Static site generation complete!')
    print('Static site generation complete!')
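For context, the rendering flow above boils down to: load data.yaml, hand the parsed dictionary to a Jinja2 template, and write the result under dist/. The sketch below is a minimal, self-contained illustration of that flow; the inline template and the example entry are hypothetical stand-ins for the repository's partials, not its actual files.

import yaml
from jinja2 import Environment, DictLoader

# Hypothetical inline template standing in for a partial such as
# templates/partials/project.html; the field names mirror data.yaml.
templates = {
    'item.html': (
        '<h5>{{ item.title }}</h5>\n'
        '<p>{{ item.desc }}</p>\n'
        '{% for name, url in item.links.items() %}'
        '<a href="{{ url }}">{{ name }}</a> '
        '{% endfor %}'
    ),
}
env = Environment(loader=DictLoader(templates))

# Example entry in the same shape as a data.yaml project item.
entry = yaml.safe_load("""
title: Example Project
desc: Short description of the project.
links:
  Home: https://example.com
""")

print(env.get_template('item.html').render(item=entry))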

View file

@@ -1,168 +0,0 @@
import markdown
import re
import os
import glob
from typing import List


def markdown_to_html_paragraphs(markdown_text: str) -> List[str]:
    """
    Convert markdown text into a list of HTML paragraphs.
    Supports mathematical equations using LaTeX syntax.

    Args:
        markdown_text (str): The markdown text to convert

    Returns:
        List[str]: A list of HTML paragraphs, each wrapped in <p> tags
    """
    # Prepend "md/" to image paths if they don't already start with md/
    markdown_text = re.sub(r'!\[(.*?)\]\((?!md/)([^/].*?\.assets/.*?)\)', r'![\1](/blog/md/\2)', markdown_text)

    # Check if the first line starts with a # for h1 title
    lines = markdown_text.split('\n')
    has_h1_title = False
    bold_title = None
    if lines and lines[0].strip().startswith('#'):
        has_h1_title = True
        title_line = lines[0].strip().lstrip('#').strip()
        bold_title = f'<p class="blog-title">{title_line}</p>'
        # Remove the title from the markdown to avoid duplicate processing
        markdown_text = '\n'.join(lines[1:])
    else:
        raise ValueError("No title found in the markdown file")

    # Configure markdown with math extensions
    extensions = [
        'markdown.extensions.extra',  # For blockquotes and other features
        'markdown.extensions.fenced_code',  # For code blocks
        'markdown.extensions.codehilite',  # For syntax highlighting
        'markdown.extensions.attr_list',  # For attributes
        'markdown.extensions.md_in_html',  # For markdown inside HTML
        'mdx_math',  # For math support
    ]

    try:
        # Prefer pymdownx.arithmatex (PyMdown Extensions), whose generic output is compatible with MathJax 3
        import pymdownx.arithmatex
        extensions.remove('mdx_math')
        extensions.append('pymdownx.arithmatex')
        extension_configs = {
            'pymdownx.arithmatex': {
                'generic': True  # Uses \(...\) for inline and \[...\] for display math
            }
        }
    except ImportError:
        # Fallback to mdx_math
        extension_configs = {
            'mdx_math': {
                'enable_dollar_delimiter': True,  # Enable $...$ for inline math
            }
        }

    # Convert markdown to HTML with math support
    html = markdown.markdown(
        markdown_text,
        extensions=extensions,
        extension_configs=extension_configs
    )

    html = re.sub(r'<p>\s*(<img[^>]+>)\s*</p>', r'\1', html, flags=re.IGNORECASE)

    # Convert image followed by blockquote to figure with caption
    html = re.sub(
        r'<img([^>]+)>\s*<blockquote>\s*<p>(.*?)</p>\s*</blockquote>',
        r'<figure class="figure">\n    <img\1 class="figure-img img-fluid rounded">\n    <figcaption class="figure-caption">\2</figcaption>\n</figure>',
        html,
        flags=re.DOTALL
    )

    # Add "link" class and target="_blank" to all <a> tags
    html = re.sub(r'<a(.*?)>', r'<a\1 class="link" target="_blank">', html)
    html = re.sub(r'<a(.*?)class="(.*?)"(.*?)class="(.*?)"(.*?)>', r'<a\1class="\2 \4"\3\5>', html)
    html = re.sub(r'<a(.*?)target="(.*?)"(.*?)target="(.*?)"(.*?)>', r'<a\1target="\2"\3\5>', html)

    # Split the HTML into paragraphs
    paragraphs = html.split('\n\n')

    # Clean up and ensure each paragraph is properly wrapped
    cleaned_paragraphs = []

    # Add the bold title as the first element if it exists
    if has_h1_title and bold_title:
        cleaned_paragraphs.append(bold_title)

    for p in paragraphs:
        p = p.strip()
        if p:
            # If the paragraph doesn't already have <p> tags, add them
            if not (p.startswith('<') and not p.startswith('<p>')):
                p = f'<p>{p}</p>'
            cleaned_paragraphs.append(p)

    return cleaned_paragraphs, title_line


def insert_markdown_into_template(template_path: str, markdown_text: str) -> str:
    """
    Insert parsed markdown content into the template HTML file.

    Args:
        template_path (str): Path to the template HTML file
        markdown_text (str): The markdown text to convert and insert

    Returns:
        str: Complete HTML with markdown content inserted
    """
    # Parse markdown into HTML paragraphs
    html_paragraphs, title_line = markdown_to_html_paragraphs(markdown_text)

    # Read the template
    with open(template_path, 'r') as f:
        template = f.read()

    # Join paragraphs into a single string
    content_html = '\n'.join(html_paragraphs)

    # Insert the content into the template
    complete_html = template.replace('{{ content }}', content_html)

    # Replace {{ title }} placeholders with the extracted title
    complete_html = complete_html.replace('{{ title }}', title_line)

    return complete_html


def process_all_markdown_files():
    """
    Process all markdown files in blog/md/ directory and generate HTML files in blog/html/.
    """
    # Get all markdown files in blog/md/
    md_files = glob.glob("dist/blog/md/*.md")
    template_path = "dist/blog/template.html"

    for md_file in md_files:
        # Extract base filename without extension
        base_name = os.path.basename(md_file)[:-3]  # Remove .md extension
        html_file = f"dist/blog/html/{base_name}.html"
        print(f"Processing {md_file} -> {html_file}")

        try:
            # Read the markdown content
            with open(md_file, "r") as f:
                markdown_text = f.read()

            # Generate HTML content
            complete_html = insert_markdown_into_template(template_path, markdown_text)

            # Write HTML output
            with open(html_file, "w") as f:
                f.write(complete_html)
        except Exception as e:
            print(f"Error processing {md_file}: {str(e)}")


if __name__ == "__main__":
    process_all_markdown_files()
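For reference, the math-aware conversion the deleted parser performed reduces to the following minimal sketch. It assumes the Python-Markdown and pymdown-extensions packages are installed; the sample text and variable names are illustrative only.

import markdown

# Illustrative sample post; $...$ and $$...$$ are delimiters arithmatex handles by default.
sample = r"""# Sample post

Inline math like $E = mc^2$ and a display equation:

$$
\int_0^1 x^2 \, dx = \frac{1}{3}
$$
"""

html = markdown.markdown(
    sample,
    extensions=['markdown.extensions.extra', 'pymdownx.arithmatex'],
    extension_configs={'pymdownx.arithmatex': {'generic': True}},  # MathJax-compatible \(...\) / \[...\] output
)
print(html)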

View file

@@ -13,16 +13,15 @@ pkgs.mkShell {
  in ''
    export PIP_REQUIRE_VIRTUALENV=1
    export VENV_PATH=${venvPath}

    if [ ! -d $VENV_PATH ]; then
      python -m venv $VENV_PATH
    fi
    source $VENV_PATH/bin/activate
    pip install -r requirements.txt

    python parser/md.py
    python generate.py

    ${if isDev then ''
      pip install watchdog==6.0.0
      python watch.py && exit

View file

@@ -54,7 +54,7 @@
<span class="mx-1">|</span>
Designed and implemented by Yan Lin.
<span class="mx-1">|</span>
<a class="link link-secondary" target="_blank" href="https://git.yanlincs.com/yanlin/Homepage">Source Code</a>
<a class="link link-secondary" target="_blank" href="https://github.com/Logan-Lin/Homepage">Source Code</a>
</p>
</div>
</footer>

View file

@@ -1,18 +0,0 @@
{% extends 'base.html' %}
{% block title %}Yan Lin's Blog{% endblock %}
{% block header_title %}Yan Lin's Blog{% endblock %}
{% block navigation %}
{% endblock %}
{% block content %}
<article class="section mt-4">
<div class="list-group list-group-flush">
{% for blog in data.blogs %}
{% include 'partials/blog.html' %}
{% endfor %}
</div>
</article>
{% endblock %}

View file

@@ -1,113 +1,122 @@
{% extends 'base.html' %}
{% block title %}Yan Lin's Homepage{% endblock %}
{% block navigation %}
{% include 'partials/navigation.html' %}
{% endblock %}
{% block content %}
<div class="row g-0 border rounded text-body-emphasis bg-body-secondary flex-md-row my-4 position-relative shadow-sm transition-shadow" style="transition: box-shadow 0.2s ease-in-out;" onmouseover="this.classList.remove('shadow-sm'); this.classList.add('shadow')" onmouseout="this.classList.remove('shadow'); this.classList.add('shadow-sm')">
{% extends 'base.html' %} {% block title %}Yan Lin's Homepage{% endblock %} {%
block navigation %} {% include 'partials/navigation.html' %} {% endblock %} {%
block content %}
<div
class="row g-0 border rounded text-body-emphasis bg-body-secondary flex-md-row my-4 position-relative shadow-sm transition-shadow"
style="transition: box-shadow 0.2s ease-in-out"
onmouseover="this.classList.remove('shadow-sm'); this.classList.add('shadow')"
onmouseout="this.classList.remove('shadow'); this.classList.add('shadow-sm')"
>
<div class="col p-4 d-flex flex-column position-static">
<h2 class="fst-italic mb-3">Biography - Yan Lin</h2>
<p class="card-text mb-auto" style="font-size: 1.1rem;">
I am currently a postdoctoral researcher in the Department of Computer Science at Aalborg University.
I received my PhD and Bachelor's degrees from Beijing Jiaotong University, China.
My research interests include <i>spatiotemporal data mining</i>, <i>representation learning</i>, and <i>AI for science</i>.
</p>
<h2 class="fst-italic mb-3">Biography - Yan Lin</h2>
<p class="card-text mb-auto" style="font-size: 1.1rem">
I am currently a postdoctoral researcher in the Department of
Computer Science at Aalborg University. I received my PhD and
Bachelor's degrees from Beijing Jiaotong University, China. My
research interests include <i>spatiotemporal data mining</i>,
<i>representation learning</i>, and <i>AI for science</i>.
</p>
</div>
<div class="col-5 col-xl-4 col-xxl-3 d-none d-lg-block d-flex align-items-center">
<img src="/profile.webp" alt="Yan Lin" class="rounded w-100" style="object-fit: contain;">
<div
class="col-5 col-xl-4 col-xxl-3 d-none d-lg-block d-flex align-items-center"
>
<img
src="/profile.webp"
alt="Yan Lin"
class="rounded w-100"
style="object-fit: contain"
/>
</div>
</div>
</div>
<article class="section" id="publications">
<article class="section" id="publications">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-book"></i> Publications</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/publications/">View All <i class="bi bi-arrow-right-circle"></i></a>
<h2 class="section-title d-inline-block mb-0">
<i class="bi bi-book"></i> Publications
</h2>
<a
class="mb-0 link link-secondary link-underline-opacity-0 h5"
href="/publications/"
>View All <i class="bi bi-arrow-right-circle"></i
></a>
</div>
<div>
<div id="primary-publications" class="list-group list-group-flush">
{% for pub in data.primaryPublications[:10] %}
{% with type='primary' %}
{% include 'partials/publication.html' %}
{% endwith %}
{% endfor %}
</div>
<hr class="my-2">
<div id="secondary-publications" class="list-group list-group-flush">
{% for pub in data.secondaryPublications[:10] %}
{% with type='secondary' %}
{% include 'partials/publication.html' %}
{% endwith %}
{% endfor %}
</div>
<div id="primary-publications" class="list-group list-group-flush">
{% for pub in data.primaryPublications[:10] %} {% with
type='primary' %} {% include 'partials/publication.html' %} {%
endwith %} {% endfor %}
</div>
<hr class="my-2" />
<div id="secondary-publications" class="list-group list-group-flush">
{% for pub in data.secondaryPublications[:10] %} {% with
type='secondary' %} {% include 'partials/publication.html' %} {%
endwith %} {% endfor %}
</div>
</div>
<div class="text-start mt-1">
<small class="text-muted" style="font-size: 0.8rem;">* Equal Contribution</small>
<small class="text-muted" style="font-size: 0.8rem"
>* Equal Contribution</small
>
</div>
</article>
</article>
<article class="section" id="projects">
<article class="section" id="projects">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-code-slash"></i> Projects</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/projects/">View All <i class="bi bi-arrow-right-circle"></i></a>
<h2 class="section-title d-inline-block mb-0">
<i class="bi bi-code-slash"></i> Projects
</h2>
<a
class="mb-0 link link-secondary link-underline-opacity-0 h5"
href="/projects/"
>View All <i class="bi bi-arrow-right-circle"></i
></a>
</div>
<div>
<div id="primary-projects" class="list-group list-group-flush">
{% for project in data.primaryProjects[:3] %}
{% with type='primary' %}
{% include 'partials/project.html' %}
{% endwith %}
{% endfor %}
</div>
<hr class="my-2">
<div id="secondary-projects" class="list-group list-group-flush">
{% for project in data.secondaryProjects[:3] %}
{% with type='secondary' %}
{% include 'partials/project.html' %}
{% endwith %}
{% endfor %}
</div>
<div id="primary-projects" class="list-group list-group-flush">
{% for project in data.primaryProjects[:3] %} {% with type='primary'
%} {% include 'partials/project.html' %} {% endwith %} {% endfor %}
</div>
<hr class="my-2" />
<div id="secondary-projects" class="list-group list-group-flush">
{% for project in data.secondaryProjects[:3] %} {% with
type='secondary' %} {% include 'partials/project.html' %} {% endwith
%} {% endfor %}
</div>
</div>
</article>
</article>
<article class="section" id="presentations">
<article class="section" id="presentations">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-easel"></i> Presentations</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/presentations/">View All <i class="bi bi-arrow-right-circle"></i></a>
<h2 class="section-title d-inline-block mb-0">
<i class="bi bi-easel"></i> Presentations
</h2>
<a
class="mb-0 link link-secondary link-underline-opacity-0 h5"
href="/presentations/"
>View All <i class="bi bi-arrow-right-circle"></i
></a>
</div>
<div class="list-group list-group-flush" id="presentation-list">
{% for presentation in data.presentations[:5] %}
{% include 'partials/presentation.html' %}
{% endfor %}
{% for presentation in data.presentations[:5] %} {% include
'partials/presentation.html' %} {% endfor %}
</div>
</article>
</article>
<article id="services" class="rounded text-body-emphasis bg-body-secondary flex-md-row my-4 position-relative p-4 transition-shadow" style="transition: box-shadow 0.2s ease-in-out;" onmouseover="this.classList.add('shadow-sm')" onmouseout="this.classList.remove('shadow-sm')">
<article
id="services"
class="rounded text-body-emphasis bg-body-secondary flex-md-row my-4 position-relative p-4 transition-shadow"
style="transition: box-shadow 0.2s ease-in-out"
onmouseover="this.classList.add('shadow-sm')"
onmouseout="this.classList.remove('shadow-sm')"
>
<h2 class="mb-3"><i class="bi bi-person-lines-fill"></i> Services</h2>
<div id="service-list">
<ul class="list ps-3">
{% for service in data.services %}
<li>{{ service }}</li>
{% endfor %}
</ul>
<ul class="list ps-3">
{% for service in data.services %}
<li>{{ service }}</li>
{% endfor %}
</ul>
</div>
</article>
</article>
<article class="section" id="blog">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-newspaper"></i> Blog</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/blog/">View All <i class="bi bi-arrow-right-circle"></i></a>
</div>
<div class="list-group list-group-flush" id="blog-list">
{% for blog in data.blogs[:3] %}
{% include 'partials/blog.html' %}
{% endfor %}
</div>
</article>
{% endblock %}
{% block extra_js %}
{{ super() }}
{% endblock %}
{% endblock %} {% block extra_js %} {{ super() }} {% endblock %}
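Two details of this template worth noting: the [:10] and [:3] slices cap how many entries each home-page section shows, and the {% with %} block scopes a type variable that the included partial can read. A tiny, hypothetical sketch of that include-with-slicing mechanism (not the repository's real partials):

from jinja2 import Environment, DictLoader

# Hypothetical templates illustrating {% include %} inside {% with %} plus list slicing.
templates = {
    'page.html': (
        '{% for pub in pubs[:2] %}'
        '{% with type="primary" %}{% include "row.html" %}{% endwith %}\n'
        '{% endfor %}'
    ),
    'row.html': '[{{ type }}] {{ pub }}',
}

env = Environment(loader=DictLoader(templates))
print(env.get_template('page.html').render(pubs=['A', 'B', 'C']))
# Renders only the first two entries, each tagged with the type set in the with-block.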

View file

@@ -1,6 +0,0 @@
<div class="list-group-item px-0">
<a class="mb-1 paper-title blog-link text-decoration-none" href="/blog/html/{{ blog.path }}.html">
{{ blog.title }} <i class="bi bi-arrow-right-circle"></i>
</a> <span class="paper-title text-muted ms-2">{{ blog.badge }}</span>
<p class="card-text mb-auto tldr">{{ blog.tldr }}</p>
</div>

View file

@@ -1,19 +1,36 @@
<nav class="navbar navbar-expand">
<ul class="navbar-nav d-flex justify-content-evenly mx-auto gap-5">
<li class="nav-item">
<a class="link nav-link px-0" href="/#publications"><i class="bi bi-book d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Publications</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#projects"><i class="bi bi-code-slash d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Projects</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#presentations"><i class="bi bi-easel d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Presentations</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#services"><i class="bi bi-person-lines-fill d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Services</span></a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#blog"><i class="bi bi-newspaper d-inline d-md-none d-lg-inline"></i> <span class="d-none d-md-inline">Blog</span></a>
</li>
</ul>
</nav>
<ul class="navbar-nav d-flex justify-content-evenly mx-auto gap-5">
<li class="nav-item">
<a class="link nav-link px-0" href="/#publications"
><i class="bi bi-book d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Publications</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#projects"
><i class="bi bi-code-slash d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Projects</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#presentations"
><i class="bi bi-easel d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Presentations</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#services"
><i
class="bi bi-person-lines-fill d-inline d-md-none d-lg-inline"
></i>
<span class="d-none d-md-inline">Services</span></a
>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="https://blog.yanlincs.com"
><i class="bi bi-newspaper d-inline d-md-none d-lg-inline"></i>
<span class="d-none d-md-inline">Blog</span></a
>
</li>
</ul>
</nav>

View file

@@ -13,7 +13,7 @@ class ChangeHandler(FileSystemEventHandler):
        if any(event.src_path.endswith(ext) for ext in ['.md', '.py', '.html', '.css', '.js', '.yaml']):
            print(f"File {event.src_path} has been modified")
            self.regenerate()

    def on_created(self, event):
        if event.is_directory:
            return
@@ -22,10 +22,9 @@ class ChangeHandler(FileSystemEventHandler):
        if any(event.src_path.endswith(ext) for ext in ['.md', '.py', '.html', '.css', '.js', '.yaml']):
            print(f"File {event.src_path} has been created")
            self.regenerate()

    def regenerate(self):
        print("Regenerating content...")
        subprocess.run(["python", "parser/md.py"])
        subprocess.run(["python", "generate.py"])
        print("Content regenerated")
@@ -34,9 +33,9 @@ if __name__ == "__main__":
    observer = Observer()
    observer.schedule(event_handler, ".", recursive=True)
    observer.start()

    http_server = subprocess.Popen(["python", "-m", "http.server", "8000", "--directory", "dist"])

    try:
        print("Watching for file changes... (Press Ctrl+C to stop)")
        while True:
@@ -44,4 +43,4 @@
    except KeyboardInterrupt:
        observer.stop()
        http_server.terminate()
    observer.join()
    observer.join()
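The watcher above follows the standard watchdog Observer pattern, split across several hunks here. A stripped-down, self-contained sketch of that loop (with a placeholder callback instead of the real regeneration commands):

import time

from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer


class RebuildHandler(FileSystemEventHandler):
    def on_modified(self, event):
        # Placeholder for the real rebuild step (e.g. running generate.py).
        if not event.is_directory:
            print(f"Changed: {event.src_path} -> rebuild")


if __name__ == "__main__":
    observer = Observer()
    observer.schedule(RebuildHandler(), ".", recursive=True)
    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()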