commit ed9566d0572efda9bf5fd00d931205ccdbf09272 Author: Yan Lin Date: Tue May 13 10:48:42 2025 +0200 First commit diff --git a/data.yaml b/data.yaml new file mode 100644 index 0000000..0bf271e --- /dev/null +++ b/data.yaml @@ -0,0 +1,277 @@ +primaryPublications: + - title: "UVTM: Universal Vehicle Trajectory Modeling with ST Feature Domain Generation" + authors: "Yan Lin, Jilin Hu, Shengnan Guo, Bin Yang, Christian S. Jensen, Youfang Lin, Huaiyu Wan" + tags: + - "IEEE TKDE" + - "2025" + links: + Preprint: "https://arxiv.org/abs/2402.07232" + Code: "https://github.com/Logan-Lin/UVTM" + + - title: "TrajCogn: Leveraging LLMs for Cognizing Movement Patterns and Travel Purposes from Trajectories" + authors: "Zeyu Zhou*, Yan Lin*, Haomin Wen, Shengnan Guo, Jilin Hu, Youfang Lin, Huaiyu Wan" + tags: + - "IJCAI" + - "2025" + links: + Preprint: "https://arxiv.org/abs/2405.12459" + Code: "https://github.com/Zeru19/PLM4Traj" + + - title: "UniTE: A Survey and Unified Pipeline for Pre-training Spatiotemporal Trajectory Embeddings" + authors: "Yan Lin, Zeyu Zhou, Yicheng Liu, Haochen Lv, Haomin Wen, Tianyi Li, Yushuai Li, Christian S. Jensen, Shengnan Guo, Youfang Lin, Huaiyu Wan" + tags: + - "IEEE TKDE" + - "2025" + links: + Paper: "https://ieeexplore.ieee.org/document/10818577" + Preprint: "https://arxiv.org/abs/2407.12550" + Code: "https://github.com/Logan-Lin/UniTE" + + - title: "Path-LLM: A Multi-Modal Path Representation Learning by Aligning and Fusing with Large Language Models" + authors: "Yongfu Wei*, Yan Lin*, Hongfan Gao, Ronghui Xu, Sean Bin Yang, Jilin Hu" + tags: + - "WWW" + - "2025" + links: + Paper: "https://openreview.net/forum?id=KmMSQS6tFn" + Code: "https://github.com/decisionintelligence/Path-LLM" + + - title: "DutyTTE: Deciphering Uncertainty in Origin-Destination Travel Time Estimation" + authors: "Xiaowei Mao*, Yan Lin*, Shengnan Guo, Yubin Chen, Xingyu Xian, Haomin Wen, Qisen Xu, Youfang Lin, Huaiyu Wan" + tags: + - "AAAI" + - "2025" + links: + Preprint: "https://arxiv.org/abs/2408.12809" + + - title: "Mobility-LLM: Learning Visiting Intentions and Travel Preference from Human Mobility Data with Large Language Models" + authors: "Letian Gong*, Yan Lin*, Xinyue Zhang, Yiwen Lu, Xuedi Han, Yichen Liu, Shengnan Guo, Youfang Lin, Huaiyu Wan" + tags: + - "NeurIPS" + - "2024" + links: + Paper: "https://openreview.net/forum?id=0feJEykDRx" + Poster: "https://neurips.cc/virtual/2024/poster/96914" + + - title: "Origin-Destination Travel Time Oracle for Map-based Services" + authors: "Yan Lin, Huaiyu Wan, Jilin Hu, Shengnan Guo, Bin Yang, Christian S. Jensen, Youfang Lin" + tags: + - "SIGMOD" + - "2024" + links: + Paper: "https://dl.acm.org/doi/10.1145/3617337" + Preprint: "https://arxiv.org/abs/2307.03048" + Code: "https://github.com/Logan-Lin/DOT" + + - title: "Pre-training General Trajectory Embeddings with Maximum Multi-view Entropy Coding" + authors: "Yan Lin, Huaiyu Wan, Shengnan Guo, Jilin Hu, Christian S. 
Jensen, Youfang Lin" + tags: + - "IEEE TKDE" + - "2023" + links: + Paper: "https://ieeexplore.ieee.org/abstract/document/10375102" + Preprint: "https://arxiv.org/abs/2207.14539" + Code: "https://github.com/Logan-Lin/MMTEC" + + - title: "Pre-training Time-aware location embeddings from spatial-temporal trajectories" + authors: "Huaiyu Wan, Yan Lin, Shengnan Guo, Youfang Lin" + tags: + - "IEEE TKDE" + - "2022" + links: + Paper: "https://ieeexplore.ieee.org/abstract/document/9351627" + Code: "https://github.com/Logan-Lin/TALE" + + - title: "Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction" + authors: "Yan Lin, Huaiyu Wan, Shengnan Guo, Youfang Lin" + tags: + - "AAAI" + - "2021" + links: + Paper: "https://ojs.aaai.org/index.php/AAAI/article/view/16548" + Code: "https://github.com/Logan-Lin/CTLE" + +secondaryPublications: + - title: "DUET: Dual Clustering Enhanced Multivariate Time Series Forecasting" + authors: "Xiangfei Qiu, Xingjian Wu, Yan Lin, Chenjuan Guo, Jilin Hu, Bin Yang" + tags: + - "KDD" + - "2025" + links: + Preprint: "https://arxiv.org/abs/2412.10859" + Code: "https://github.com/decisionintelligence/DUET" + + - title: "Diff-RNTraj: A Structure-aware Diffusion Model for Road Network-constrained Trajectory Generation" + authors: "Tonglong Wei, Youfang Lin, Shengnan Guo, Yan Lin, Yiheng Huang, Chenyang Xiang, Yuqing Bai, Menglu Ya, Huaiyu Wan" + tags: + - "IEEE TKDE" + - "2024" + links: + Paper: "https://www.computer.org/csdl/journal/tk/5555/01/10679607/20b3hlbjBOo" + Preprint: "https://arxiv.org/abs/2402.07369" + Code: "https://github.com/wtl52656/Diff-RNTraj" + + - title: "STCDM: Spatio-Temporal Contrastive Diffusion Model for Check-In Sequence Generation" + authors: "Letian Gong, Shengnan Guo, Yan Lin, Yichen Liu, Erwen Zheng, Yiwei Shuang, Youfang Lin, Jilin Hu, Huaiyu Wan" + tags: + - "IEEE TKDE" + - "2024" + links: + Paper: "https://ieeexplore.ieee.org/document/10836764" + + - title: "Micro-Macro Spatial-Temporal Graph-based Encoder-Decoder for Map-Constrained Trajectory Recovery" + authors: "Tonglong Wei, Youfang Lin, Yan Lin, Shengnan Guo, Lan Zhang, Huaiyu Wan" + tags: + - "IEEE TKDE" + - "2024" + links: + Paper: "https://www.computer.org/csdl/journal/tk/5555/01/10517676/1WCj0j0FljW" + Preprint: "https://arxiv.org/abs/2404.19141" + Code: "https://github.com/wtl52656/MM-STGED" + + - title: "Inductive and Adaptive Graph Convolution Networks Equipped with Constraint Task for Spatial-Temporal Traffic Data Kriging" + authors: "Tonglong Wei, Youfang Lin, Shengnan Guo, Yan Lin, Yiji Zhao, Xiyuan Jin, Zhihao Wu, Huaiyu Wan" + tags: + - "KBS" + - "2024" + links: + Paper: "https://www.sciencedirect.com/science/article/pii/S0950705123010730" + Code: "https://github.com/wtl52656/IAGCN" + + - title: "Spatial-Temporal Cross-View Contrastive Pre-Training for Check-in Sequence Representation Learning" + authors: "Letian Gong, Huaiyu Wan, Shengnan Guo, Li Xiucheng, Yan Lin, Erwen Zheng, Tianyi Wang, Zeyu Zhou, Youfang Lin" + tags: + - "IEEE TKDE" + - "2024" + links: + Preprint: "https://arxiv.org/abs/2407.15899" + + - title: "Contrastive Pre-training with Adversarial Perturbations for Check-In Sequence Representation Learning" + authors: "Letian Gong, Youfang Lin, Shengnan Guo, Yan Lin, Tianyi Wang, Erwen Zheng, Zeyu Zhou, Huaiyu Wan" + tags: + - "AAAI" + - "2023" + links: + Paper: "https://ojs.aaai.org/index.php/AAAI/article/view/25546" + Code: "https://github.com/LetianGong/CACSR" + + - title: "Adversarial 
Self-Attentive Time-Variant Neural Networks for Multi-Step Time Series Forecasting" + authors: "Changxia Gao, Ning Zhang, Youru Li, Yan Lin, Huaiyu Wan" + tags: + - "ESWA" + - "2023" + links: + Paper: "https://www.sciencedirect.com/science/article/pii/S0957417423012241" + + - title: "Multi-scale Adaptive Attention-based Time-Variant Neural Networks for Multi-step Time Series Forecasting" + authors: "Changxia Gao, Ning Zhang, Youru Li, Yan Lin, Huaiyu Wan" + tags: + - "APIN" + - "2023" + links: + Paper: "https://link.springer.com/article/10.1007/s10489-023-05057-7" + + - title: "WITRAN: Water-wave Information Transmission and Recurrent Acceleration Network for Long-range Time Series Forecasting" + authors: "Yuxin Jia, Youfang Lin, Xinyan Hao, Yan Lin, Shengnan Guo, Huaiyu Wan" + tags: + - "NeurIPS" + - "2023" + links: + Paper: "https://openreview.net/forum?id=y08bkEtNBK" + Code: "https://github.com/Water2sea/WITRAN" + +primaryProjects: + - title: 'Research on Prediction of User Travel Destination and Travel Time Based on Trajectory Representation Learning' + tags: + - "Fundamental Research Funds for the Central Universities of China" + desc: "Applies representation learning to trajectory data to transform original features into high-level information, improving the performance of downstream tasks such as travel time and destination prediction." + links: {} + + - title: 'Development of OverleafCopilot - Empowering Academic Writing in Overleaf with Large Language Models' + tags: + - "Personal Interest Project" + desc: "This project aims to develop a Browser extension to seamlessly integrate Large Language Models (such as ChatGPT) into the popular online academic writing platform, Overleaf." + links: + Home: "https://www.overleafcopilot.com/" + Install: "https://chromewebstore.google.com/detail/overleaf-copilot/eoadabdpninlhkkbhngoddfjianhlghb" + + - title: 'Development of PromptGenius - All-purpose prompts for LLMs' + tags: + - "Personal Interest Project" + desc: "This project focuses on developing a website that offers a wide range of prompt categories, enhancing the versatility of LLMs for various tasks and improving their output quality." + links: + Website: "https://www.promptgenius.site/" + Code: "https://github.com/wenhaomin/ChatGPT-PromptGenius" + +secondaryProjects: + - title: 'Research on Inverse Design of Materials Using Diffusion Probabilistic Models' + tags: + - "Villum Foundation" + desc: "This project focuses on developing diffusion probabilistic models to first understand the relationship between chemistry/structure and material properties, then enable the inverse design of new materials with specific properties. This project currently supports my postdoctoral research position." + links: {} + + - title: 'Research on Pre-training Representation Learning Methods of Spatial-temporal Trajectory Data for Traffic Prediction' + tags: + - "National Natural Science Foundation of China" + desc: "This project aims to propose pre-training representation learning methods for spatial-temporal trajectory data, modeling multiple features to improve traffic prediction tasks. It demonstrates how trajectory representation learning can enhance traffic data mining." 
+ links: {} + + - title: 'Research on Spatial-temporal Trajectory Generation and Representation Learning Methods for Sparsity Problems' + tags: + - "National Natural Science Foundation of China" + desc: "This project explores how to generate high-quality spatial-temporal trajectory data and corresponding representations to address sparsity-related issues, thereby supporting a variety of downstream tasks." + links: {} + +presentations: + - title: 'Self-supervised Learning of Trajectory Data' + tags: + - "Guest lecture" + - "Aalborg University" + links: + Slides: "/assets/Self-supervised Learning of Trajectory Data.pdf" + + - title: 'PLM4Traj: Leveraging Pre-trained Language Models for Cognizing Movement Patterns and Travel Purposes from Trajectories' + tags: + - "Workshop presentation" + - "KDD 2024" + links: + Slides: "/assets/KDD_2024_Workshop_PLM4Traj.pdf" + Paper: "https://arxiv.org/abs/2405.12459" + + - title: 'Origin-Destination Travel Time Oracle for Map-based Services' + tags: + - "Paper Oral" + - "SIGMOD 2024" + links: + Slides: "/assets/SIGMOD-Oral-PPT.pdf" + + - title: 'Self-supervised Learning of Spatial-temporal Trajectories' + tags: + - "Tutorial" + - "SpatialDI 2024" + links: + Slides: "/assets/Talk on SpatialDI 2024.pdf" + + - title: 'Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction' + tags: + - "Paper Oral" + - "AAAI 2021" + links: + Slides: "/assets/AAAI21 Oral PPT.pdf" + +services: + - "IEEE, ACM member" + - "Secretary of IEEE (Denmark Section) Computer Society" + - "Reviewer for journals including TIST, TII, and TVT" + - "Member of program committees of ICLR, KDD, AAAI, CVPR, ICCV, IJCAI, and WWW" + +blogs: + - title: "One Step Diffusion Models" + badge: "May 2025" + path: "one-step-diffusion-models" + tldr: "Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps." + + - title: "Multi-modal and Multi-function Transformers" + badge: "April 2025" + path: "multi-modal-transformer" + tldr: "Multi-modal and multi-function Transformers enables a single architecture to process diverse data types such as language, images, and videos simultaneously. These models employ techniques like vector quantization and lookup-free quantization to map different modalities into a unified embedding space, allowing the Transformer to handle them within the same sequence. Beyond processing multiple data types, these architectures can also combine different functionalities-such as auto-regressive language generation and diffusion-based image creation-within a single model." 
\ No newline at end of file diff --git a/dist/assets/AAAI21 Oral PPT.pdf b/dist/assets/AAAI21 Oral PPT.pdf new file mode 100644 index 0000000..0f43fa6 Binary files /dev/null and b/dist/assets/AAAI21 Oral PPT.pdf differ diff --git a/dist/assets/KDD_2024_Workshop_PLM4Traj.pdf b/dist/assets/KDD_2024_Workshop_PLM4Traj.pdf new file mode 100644 index 0000000..23c6ccc Binary files /dev/null and b/dist/assets/KDD_2024_Workshop_PLM4Traj.pdf differ diff --git a/dist/assets/SIGMOD-Oral-PPT.pdf b/dist/assets/SIGMOD-Oral-PPT.pdf new file mode 100644 index 0000000..8622a1d Binary files /dev/null and b/dist/assets/SIGMOD-Oral-PPT.pdf differ diff --git a/dist/assets/Self-supervised Learning of Trajectory Data.pdf b/dist/assets/Self-supervised Learning of Trajectory Data.pdf new file mode 100644 index 0000000..906cfee Binary files /dev/null and b/dist/assets/Self-supervised Learning of Trajectory Data.pdf differ diff --git a/dist/assets/Talk on SpatialDI 2024.pdf b/dist/assets/Talk on SpatialDI 2024.pdf new file mode 100644 index 0000000..d6ae47c Binary files /dev/null and b/dist/assets/Talk on SpatialDI 2024.pdf differ diff --git a/dist/blog/html/multi-modal-transformer.html b/dist/blog/html/multi-modal-transformer.html new file mode 100644 index 0000000..42dc413 --- /dev/null +++ b/dist/blog/html/multi-modal-transformer.html @@ -0,0 +1,229 @@ + + + + + + + Yan Lin's Blog - Multi-modal and Multi-function Transformers + + + + + + + + + + +
+
+
+
+ +
+
+ +
+
+ +
+
+
+
+ +
+
+

Multi-modal and Multi-function Transformers

+

Transformers have gained immense popularity within deep learning and AI communities in recent years. Since their introduction in Vaswani et al., "Attention Is All You Need", they have proven to be powerful sequential models across diverse domains, with thousands of variations and "improved versions." The rise of Large Language Models (LLMs), which largely use Transformers as their foundation, has led to another surge in research around this architecture. This trend has even led graph learning and Computer Vision (CV) communities to move beyond their established foundation models (i.e., GNNs and CNNs) and embrace Transformers. This explains the increasing prevalence of graph Transformers and image Transformers today.

+
+

Han et al., “A Survey on Vision Transformer”; Khan et al., “Transformers in Vision”; Yun et al., “Graph Transformer Networks.”

+
+

Beyond "chasing the trend," using Transformer as a unified foundation model offers several advantages:

+
    +
  • Transformers excel at capturing long-term dependencies. Unlike GNNs and CNNs, which require deeper network structures for longer context, Transformers natively support global dependency modeling through their self-attention mechanism. They also avoid the over-smoothing and vanishing-gradient problems that hinder context-length scaling in other network architectures.
  • +
  • Transformers process sequences in parallel rather than sequentially, enabling full utilization of GPU acceleration. This advantage can be further enhanced with techniques like those described in Dao et al., "FlashAttention."
  • +
  • Transformers are flexible network structures. They don't inherently enforce sequentiality: without positional encoding, a Transformer is permutation-invariant with respect to the ordering of its input steps. Through strategic permutation and positional encoding, Transformers can adapt to a wide range of structured and unstructured data.
  • +
  • The development of LLMs has made many open-weight Transformer models available with strong natural language understanding capabilities. These Transformers can be prompted and fine-tuned to model other modalities such as spatiotemporal data and images while retaining their language modeling abilities, creating opportunities for developing multi-modal foundation models.
  • +
  • From a practical perspective, using Transformer as a foundation allows reuse of technical infrastructure and optimizations developed over years, including efficient architecture designs, training pipelines, and specialized hardware.
  • +
+

In this article, we will briefly explore techniques for unifying multiple modalities (e.g., natural language and images) and multiple functionalities (e.g., language models and diffusion denoisers) within a single Transformer. These techniques are largely sourced from recent oral papers presented at ICML, ICLR, and CVPR conferences. I assume readers have general knowledge of basic concepts in ML and neural networks, Transformers, LLMs, and diffusion models.

+

Since images and language modalities represent continuous and discrete data respectively, we will use them as examples throughout this article. Keep in mind that the techniques introduced can be readily extended to other modalities, including spatiotemporal data.

+

General Goal

+

The goal of a multi-modal Transformer is to create a model that can accept multi-modal inputs and produce multi-modal outputs. For example, instead of using a CNN-based image encoder and a Transformer-based language encoder to map image and language modalities to the latent space separately, a multi-modal Transformer would be able to process the combination of image and language (sentence) as a single sequence.

+
+ image +
An example of “conventional” multi-modal fusion. Each modality is processed by a separate model, and the modalities are fused at some point. Source: Xiang, Hao, Runsheng Xu, and Jiaqi Ma. "HM-ViT: Hetero-modal vehicle-to-vehicle cooperative perception with vision transformer." CVPR, 2023.
+
+
+ image (1) +
An example of a Transformer that can handle multi-modal inputs and outputs. Different modalities are all projected into tokens and subsequently processed by a unified Transformer encoder. Source: Kondratyuk, Dan, Lijun Yu, et al. “VideoPoet: A Large Language Model for Zero-Shot Video Generation,” ICML, 2024.
+
+

Beyond multi-modal processing, a multi-function Transformer can, for example, function as both a language model (auto-regressive generation) and diffusion denoiser (score-matching generation) simultaneously, supporting two of the most common generation schemes used today.

+

Modality Embedding

+

A fundamental challenge in unifying multiple modalities within a single Transformer is how to represent different modalities in the same embedding space. For the "QKV" self-attention mechanism to work properly, each item in the input sequence must be represented by an embedding vector of the same dimension, matching the "model dimension" of the Transformer.

+
+ image (2) +
Illustration of the QKV self-attention mechanism in Transformer. Source
+
+
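As a minimal sketch of why this shared dimension matters (assuming PyTorch; the sequence length, model dimension, and single-head setup are illustrative rather than taken from any specific model), here is scaled dot-product self-attention over a mixed-modality sequence whose items have all been embedded to the same model dimension:

```python
import torch
import torch.nn.functional as F

d_model = 64                       # the Transformer's "model dimension"
seq = torch.randn(1, 10, d_model)  # 10 items (e.g., text + image tokens), all d_model-dim

# Learnable projections; every modality must already be embedded to d_model
# for these to apply uniformly.
w_q = torch.nn.Linear(d_model, d_model)
w_k = torch.nn.Linear(d_model, d_model)
w_v = torch.nn.Linear(d_model, d_model)

q, k, v = w_q(seq), w_k(seq), w_v(seq)
scores = q @ k.transpose(-2, -1) / d_model ** 0.5  # (1, 10, 10) attention logits
attn = F.softmax(scores, dim=-1)
out = attn @ v                                     # same shape as the input sequence
```

Any item whose embedding did not match `d_model` would break the query-key product, which is exactly why every modality must first be projected or quantized into this shared space.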

The most common method for mapping language into the embedding space is through tokenization and token embedding. A tokenizer maps a word or word fragment into a discrete token index, and an index-fetching embedding layer (implemented in frameworks like PyTorch with nn.Embedding) maps this index into a fixed-dimension embedding vector. In principle, all discrete features can be mapped into the embedding space using this approach.

+
+ 1_Dk1X5rmLomXqqTPeuHgBpw +
Visualization of tokenizer and index-fetching embedding layer. Source
+
+
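As a minimal sketch of this pipeline (assuming PyTorch; the toy vocabulary and the 64-dimensional embedding size are made up for illustration):

```python
import torch
import torch.nn as nn

# Toy vocabulary: a real tokenizer (BPE, WordPiece, ...) would produce these indices.
vocab = {"<bos>": 0, "the": 1, "cat": 2, "sat": 3, "<eos>": 4}
token_ids = torch.tensor([[vocab[w] for w in ["<bos>", "the", "cat", "sat", "<eos>"]]])

# Index-fetching embedding layer: one row per token in the vocabulary.
embed = nn.Embedding(num_embeddings=len(vocab), embedding_dim=64)
token_emb = embed(token_ids)   # shape (1, 5, 64), ready to feed into the Transformer
```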

Vector Quantization

+

For continuous features, one intuitive approach is to first tokenize them into discrete tokens, thereby unifying the embedding process across both discrete and continuous features. Vector quantization, introduced in VQ-VAE, is one of the most common methods for this purpose.

+
+

Van Den Oord, Aaron, and Oriol Vinyals. "Neural discrete representation learning." NeurIPS, 2017.

+
+

Vector quantization maintains a "codebook" \(\boldsymbol C \in \mathbb R^{n\times d}\), which functions similarly to the index-fetching embedding layer, where \(n\) is the total number of unique tokens, and \(d\) is the embedding size. A given continuous vector \(\boldsymbol{z}\in\mathbb R^{d}\) is quantized into a discrete value \(i\in[0,n-1]\) by finding the closest row vector in \(\boldsymbol C\) to \(\boldsymbol{z}\), and that row vector \(\boldsymbol C_i\) is fetched as the embedding for \(\boldsymbol{z}\). Formally: +$$ +i = \arg\min_j \|\boldsymbol z - \boldsymbol C_j\|_2 +$$

+
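A minimal sketch of this lookup (assuming PyTorch; the codebook here is random for illustration, whereas VQ-VAE learns it jointly with the encoder and decoder):

```python
import torch

n, d = 512, 64                    # codebook size and embedding dimension
codebook = torch.randn(n, d)      # C in R^{n x d}, learned in practice (e.g., VQ-VAE)
z = torch.randn(8, d)             # a batch of continuous vectors to quantize

dist = torch.cdist(z, codebook)   # (8, n) pairwise L2 distances to every codebook row
idx = dist.argmin(dim=-1)         # discrete token index i for each vector
z_q = codebook[idx]               # fetched embeddings C_i, same shape as z
```

In VQ-VAE training, the non-differentiable argmin is typically bypassed with a straight-through estimator plus codebook and commitment losses, which are omitted here.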

Lookup-Free Quantization

+

A significant limitation of vector quantization is that it requires calculating distances between the given continuous vectors and the entire codebook, which becomes computationally expensive for large-scale codebooks. This creates tension with the need for expanded codebooks to represent complex modalities such as images and videos. Research has shown that simply increasing the number of unique tokens doesn't always improve codebook performance.

+
+

“A simple trick for training a larger codebook involves decreasing the code embedding dimension when increasing the vocabulary size.” Source: Yu, Lijun, Jose Lezama, et al. “Language Model Beats Diffusion - Tokenizer Is Key to Visual Generation,” ICLR, 2024.

+
+

Building on this insight, Lookup-Free Quantization (LFQ) eliminates the embedding dimension of codebooks (essentially reducing the embedding dimension to 0) and directly calculates the discrete index \(i\) by individually quantizing each dimension of \(\boldsymbol z\) into a binary digit. The index \(i\) can then be computed by converting the binary representation to decimal. Formally: +$$ +i=\sum_{j=1}^{d} 2^{(j-1)}\cdot \mathbb{1}(z_j > 0) +$$

+
+

For example, given a continuous vector \(\boldsymbol z=\langle -0.52, 1.50, 0.53, -1.32\rangle\), we first quantize each dimension into \(\langle 0, 1, 1, 0\rangle\), based on the sign of each dimension. The token index of \(\boldsymbol z\) is simply the decimal equivalent of the binary 0110, which is 6.

+
+

However, this approach introduces another challenge: we still need an index-fetching embedding layer to map these token indices into embedding vectors for the Transformer. This, combined with the typically large number of unique tokens when using LFQ—a 32-dimensional \(\boldsymbol z\) will result in \(2^{32}=4,294,967,296\) unique tokens—creates significant efficiency problems. One solution is to factorize the token space. Effectively, this means splitting the binary digits into multiple parts, embedding each part separately, and concatenating the resulting embedding vectors. For example, with a 32-dimensional \(\boldsymbol z\), if we quantize and embed its first and last 16 dimensions separately, we “only” need to handle \(2^{16}*2= 131,072\) unique tokens.

+
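A minimal sketch of LFQ with a factorized token space (assuming PyTorch; the 32-dimensional latent and the 16/16 split follow the example above, and the embedding tables are placeholders):

```python
import torch

z = torch.randn(8, 32)                      # continuous latents, 32 dims each

bits = (z > 0).long()                       # per-dimension binary quantization
weights = 2 ** torch.arange(16)             # binary-to-decimal conversion weights

# Factorize into two 16-bit groups so each group only needs a 2^16-entry table
# instead of a single 2^32-entry one.
idx_lo = (bits[:, :16] * weights).sum(dim=-1)   # token index from the first 16 dims
idx_hi = (bits[:, 16:] * weights).sum(dim=-1)   # token index from the last 16 dims

embed_lo = torch.nn.Embedding(2 ** 16, 32)
embed_hi = torch.nn.Embedding(2 ** 16, 32)
token_emb = torch.cat([embed_lo(idx_lo), embed_hi(idx_hi)], dim=-1)  # (8, 64)
```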

Note that this section doesn't extensively explain how to map raw continuous features into the vector \(\boldsymbol{z}\), as these techniques are relatively straightforward and depend on the specific feature type—for example, fully-connected layers for numerical features, or CNN/GNN with feature flattening for structured data.

+

Quantization over Linear Projection

+

You might be asking—why can't we simply use linear projections to map the raw continuous features into the embedding space? What are the benefits of quantizing continuous features into discrete tokens?

+

Although Transformers are regarded as universal sequential models, they were originally designed for discrete tokens in Vaswani et al., "Attention Is All You Need". Empirically, they tend to perform better on discrete tokens than on raw continuous features. This is supported by research papers showing that quantizing continuous features improves the performance of Transformers, and by works demonstrating Transformers' subpar performance when applied directly to continuous features.

+
+

Mao, Chengzhi, Lu Jiang, Mostafa Dehghani, Carl Vondrick, Rahul Sukthankar, and Irfan Essa. “Discrete Representations Strengthen Vision Transformer Robustness,” ICLR, 2022.

+

Ilbert, Romain, Ambroise Odonnat, et al. “SAMformer: Unlocking the Potential of Transformers in Time Series Forecasting with Sharpness-Aware Minimization and Channel-Wise Attention,” ICML, 2024.

+
+

On the other hand, unifying different modalities into tokens is especially beneficial in the context of Transformer-based "foundation models," since it preserves the auto-regressive next-token prediction architecture of LLMs. Combined with special tokens such as "start of sentence" and "end of sentence," the Transformer model can flexibly generate content of mixed modalities and varying length.

+
+

For example, by quantizing videos into discrete tokens and combining the token space of videos and language, one can create a unified Transformer model that generates both videos and language in one sequence. The start and end points of video and language sub-sequences are fully determined by the model, based on the specific input prompt. This structure would be difficult to replicate if we used tokenization for language but linear projection for videos.

+
+

Transformer Backbone

+

After different modalities are mapped into the same embedding space, they can be arranged into a sequence of embedding vectors and input into a Transformer backbone. We don't discuss the variations of Transformer structure and improvement techniques here, as they are numerous, and ultimately function similarly as sequential models.

+
+

Lan et al., “ALBERT”; Ye et al., “Differential Transformer”; Kitaev, Kaiser, and Levskaya, “Reformer”; Su et al., “RoFormer”; Dai et al., “Transformer-XL.”

+
+

As we know, the "full" Transformer structure proposed in Vaswani et al., "Attention Is All You Need" includes an encoder and a decoder. They perform self-attention within their respective input sequences, and the decoder additionally performs cross-attention between its input sequence and the memory sequence derived from the encoder's output. Some early language models use encoder-only structure (like Devlin et al., "BERT") focused on outputting embedding vectors or encoder-decoder structure (like Chung et al., "Scaling Instruction-Finetuned Language Models") for generating natural language output. Most modern large language models and foundation models use decoder-only structure (like Brown et al., "Language Models Are Few-Shot Learners"), focusing on auto-regressive generation of language output.

+

The encoder-only structure theoretically excels at representation learning, and its produced embedding vectors can be applied to various downstream tasks. Recent developments have gradually moved towards decoder-only structure, centered around the idea of building models that are capable of directly generating the required final output of every downstream task.

+
+

For example, to perform sentiment analysis, BERT will compute an embedding vector for the query sentence, and the embedding vector can be used in a dedicated classifier to predict the sentiment label. GPT, on the other hand, can directly answer the question "what is the sentiment associated with the query sentence?" Comparatively, GPT is more versatile in most cases and can easily perform zero-shot prediction.

+
+

Nevertheless, representation learning is still a relevant topic. The general understanding is that decoder-only structure cannot perform conventional representation learning, for example mapping a sentence into a fixed-dimension embedding vector. Yet, there are a few works in the latest ICLR that shed light on the utilization of LLMs as representation learning or embedding models:

+
+

Gao, Leo, Tom Dupre la Tour, Henk Tillman, Gabriel Goh, Rajan Troll, Alec Radford, Ilya Sutskever, Jan Leike, and Jeffrey Wu. “Scaling and Evaluating Sparse Autoencoders,” 2024. Link

+

Li, Ziyue, and Tianyi Zhou. “Your Mixture-of-Experts LLM Is Secretly an Embedding Model for Free,” 2024. Link

+

Zhang, Jie, Dongrui Liu, Chen Qian, Linfeng Zhang, Yong Liu, Yu Qiao, and Jing Shao. “REEF: Representation Encoding Fingerprints for Large Language Models,” 2024. Link

+
+

Output Layer

+

For language generation, Transformers typically use classifier output layers, mapping the latent vector of each item in the output sequence back to tokens. As established in the "modality embedding" section, an effective way to embed continuous features is to quantize them into discrete tokens. Correspondingly, an intuitive method to output continuous features is to map these discrete tokens back to the continuous feature space, essentially reversing the vector quantization process.

+

Reverse Vector Quantization

+

One approach to reverse vector quantization is readily available in VQ-VAE, since it is an auto-encoder. Given a token \(i\), we can look up its embedding in the codebook as \(\boldsymbol C_i\), then apply a decoder network to map \(\boldsymbol C_i\) back to the continuous feature vector \(\boldsymbol z\). The decoder network can either be pre-trained within the VQ-VAE framework (training the tokenizer, encoder, and decoder with an auto-encoding loss) or trained end-to-end along with the whole Transformer. In the NLP and CV communities, the pre-training approach is more popular, since there are many large-scale pre-trained auto-encoders available.

+
+ image (4) +
The encoder-decoder structure of MAGVIT (Yu et al., “MAGVIT”), a visual VQ-VAE model. A 3D-VQ encoder quantizes a video into discrete tokens, and a 3D-VQ decoder maps them back to the pixel space.
+
+
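A minimal sketch of this de-tokenization path (assuming PyTorch; the token grid size and the two-layer transposed-convolution decoder are stand-ins for illustration, not MAGVIT's actual 3D-VQ decoder):

```python
import torch
import torch.nn as nn

n, d = 1024, 64
codebook = torch.randn(n, d)                 # shared with the tokenizer/encoder

tokens = torch.randint(0, n, (1, 8, 8))      # an 8x8 grid of image tokens
z_q = codebook[tokens]                       # (1, 8, 8, 64): look up C_i for each token
z_q = z_q.permute(0, 3, 1, 2)                # (1, 64, 8, 8) for convolutional layers

# Stand-in decoder: upsample the token grid back to a 32x32 RGB image.
decoder = nn.Sequential(
    nn.ConvTranspose2d(d, 32, kernel_size=4, stride=2, padding=1),  # 8x8 -> 16x16
    nn.ReLU(),
    nn.ConvTranspose2d(32, 3, kernel_size=4, stride=2, padding=1),  # 16x16 -> 32x32
)
image = decoder(z_q)                         # (1, 3, 32, 32)
```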

Efficiency Enhancement

+

For continuous feature generation, unlike language generation where the output tokens are themselves the final output, we are essentially approximating the final output with a limited-size token space. Thus, for complicated continuous features such as images and videos, we have to expand the token space or use more tokens per image or video frame to improve generation quality, which can create efficiency challenges.

+

There are several workarounds to improve the efficiency of multi-modal outputs. One approach is to generate low-resolution outputs first, then use a separate super-resolution module to improve the quality of the output. This approach is explored in Kondratyuk et al., "VideoPoet" and Tian et al., "Visual Autoregressive Modeling". Interestingly, the overall idea is very similar to NVIDIA's DLSS, where the graphics card renders a low-resolution frame (e.g., 1080p) using the conventional rasterization pipeline, then a super-resolution model upscales the frame (e.g., to 4K) using the graphics card's tensor hardware, improving the game's overall frame rate.

+

Another workaround follows the idea of compression. Take video generation as an example. The model generates full features for key frames, and lightweight features for motion vectors that describe subtle differences from those key frames. This is essentially how inter-frame video codecs work: they take advantage of temporal redundancy between neighboring frames.

+
+ image (5) +
Key frames and motion vectors used in Jin et al., “Video-LaVIT.”
+
+

Fuse with Diffusion Models

+

Despite continuous efforts to enable representation and generation of images and videos with a language model structure (auto-regressive), current research indicates that diffusion models (more broadly speaking, score-matching generative models) outperform language models on continuous feature generation. Score-matching generative models have their own separate and substantial community, with strong theoretical foundations and numerous variations emerging each year, such as stochastic differential equations, Bayesian flow, and rectified flow. In short, score-matching generative models are clearly here to stay alongside language models.

+

An intriguing question arises: why not integrate the structures of language models and diffusion models into one Transformer to reach the best of both worlds? Zhou et al. in "Transfusion" explored this idea. The approach is straightforward: build a Transformer that can handle both language and image inputs and outputs. The language component functions as a language model, while the image component serves as a denoiser network for diffusion models. The model is trained by combining the language modeling loss and DDPM loss, enabling it to function either as a language model or a text-to-image denoiser.

+
+ image (6) +
A Transformer capable of functioning as a language model and a diffusion denoiser at the same time. Source: Zhou, Chunting, Lili Yu, et al. “Transfusion: Predict the Next Token and Diffuse Images with One Multi-Modal Model,” ICLR, 2025.
+
+
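A minimal sketch of the combined objective (assuming PyTorch; the tensors stand in for the backbone's outputs at text and image positions, and the loss weighting and noise schedule are omitted, so this illustrates the idea rather than the actual Transfusion implementation):

```python
import torch
import torch.nn.functional as F

vocab_size, d_model = 32000, 512

# Placeholder backbone outputs for one mixed text+image sequence:
text_logits = torch.randn(1, 20, vocab_size)   # predictions at text positions
eps_pred = torch.randn(1, 16, d_model)         # predicted noise at image-latent positions

text_targets = torch.randint(0, vocab_size, (1, 20))  # next tokens (shifted by one)
eps_true = torch.randn(1, 16, d_model)                # the noise actually added

lm_loss = F.cross_entropy(text_logits.view(-1, vocab_size), text_targets.view(-1))
ddpm_loss = F.mse_loss(eps_pred, eps_true)

loss = lm_loss + ddpm_loss     # joint objective: language modeling + diffusion denoising
```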

Conclusion

+

In conclusion, the evolution of Transformers into versatile foundation models capable of handling multiple modalities and functionalities represents a significant advancement in AI research. By enabling a single architecture to process diverse data types through techniques like vector quantization and lookup-free quantization, researchers have created models that can seamlessly integrate language, images, and other modalities within the same embedding space.

+

In our research domain, we encounter even more diverse and domain-specific multi-modal data, such as traffic flows, trajectories, and real-world agent interactions. A unified Transformer for such data presents a promising solution for creating "foundation models" that generalize across diverse tasks and scenarios. However, domain-specific challenges, including data encoding and decoding, computational efficiency, and scalability, must be addressed to realize this potential.

+
+

Copyright © 2025. Designed and implemented by Yan Lin.

+
+ + + + + \ No newline at end of file diff --git a/dist/blog/html/one-step-diffusion-models.html b/dist/blog/html/one-step-diffusion-models.html new file mode 100644 index 0000000..7d2f108 --- /dev/null +++ b/dist/blog/html/one-step-diffusion-models.html @@ -0,0 +1,216 @@ + + + + + + + Yan Lin's Blog - One Step Diffusion Models + + + + + + + + + + +
+
+
+
+ +
+
+ +
+
+ +
+
+
+
+ +
+
+

One Step Diffusion Models

+

Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.

+
+

Background

+

Diffusion models (DMs), or more broadly speaking, score-matching generative models, have become the de facto framework for building deep generative models. They demonstrate exceptional generation performance, especially on continuous modalities including images, videos, audio, and spatiotemporal data.

+

Most diffusion models work by coupling a forward diffusion process and a reverse denoising diffusion process. The forward diffusion process gradually adds noise to the ground-truth clean data \(X_0\), until noisy data \(X_T\) that follows a relatively simple distribution is reached. The reverse denoising diffusion process starts from the noisy data \(X_T\), and removes the noise component step-by-step until clean generated data \(X_0\) is reached. The reverse process is inherently sequential: each step depends on the previous one, so it cannot be parallelized within a single generation, which is inefficient when a large number of steps is required.

+
+ image-20250503125941212 +
The two processes in a typical diffusion model. Source: Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”
+
+
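A minimal sketch of the two processes (assuming PyTorch; `denoiser` is a placeholder network and the linear beta schedule is a common but simplified choice): the forward corruption can be sampled in closed form at any timestep, while the reverse process must be simulated step by step.

```python
import torch

T = 1000
betas = torch.linspace(1e-4, 0.02, T)
alphas_bar = torch.cumprod(1.0 - betas, dim=0)     # \bar{\alpha}_t

def forward_sample(x0, t):
    """Sample x_t ~ q(x_t | x_0) in closed form at any timestep t."""
    eps = torch.randn_like(x0)
    return alphas_bar[t].sqrt() * x0 + (1 - alphas_bar[t]).sqrt() * eps

def reverse_sample(denoiser, shape):
    """Sequential reverse process: T dependent steps, not parallelizable."""
    x = torch.randn(shape)                         # start from pure noise x_T
    for t in reversed(range(T)):
        eps_hat = denoiser(x, t)                   # placeholder denoiser call
        mean = (x - betas[t] / (1 - alphas_bar[t]).sqrt() * eps_hat) / (1 - betas[t]).sqrt()
        x = mean + betas[t].sqrt() * torch.randn_like(x) if t > 0 else mean
    return x
```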

Understanding DMs

+

There are many ways to understand how Diffusion Models (DMs) work. One of the most common and intuitive approaches is that a DM learns an ordinary differential equation (ODE) that transforms noise into data. Imagine an ODE vector field between the noise \(X_T\) and clean data \(X_0\). By training on sufficiently large numbers of timesteps \(t\in [0,T]\), a DM is able to learn the vector (tangent) towards the cleaner data \(X_{t-\Delta t}\), given any specific timestep \(t\) and the corresponding noisy data \(X_t\). This idea is easy to illustrate in a simplified 1-dimensional data scenario.

+
+ image-20250503132738122 +
Illustrated ODE flow of a diffusion model on 1-dimensional data. Source: Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.” It should be noted that as the figure suggests, there are differences between ODEs and DMs in a narrow sense. Flow matching models, a variant of DMs, more closely resemble ODEs.
+
+

DMs Scale Poorly with Few Steps

+

Vanilla DDPM, which is essentially a discrete-timestep DM, can only perform the reverse process using the same number of steps it is trained on, typically thousands. DDIM introduces a reparameterization scheme that enables skipping steps during the reverse process of DDPM. Continuous-timestep DMs, such as those formulated as stochastic differential equations (SDEs), naturally allow using fewer reverse-process steps than were used during the forward process/training.

+
+

Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.” +Song, Meng, and Ermon, “Denoising Diffusion Implicit Models.” +Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”

+
+

Nevertheless, it is observed that their performance typically degrades catastrophically when the number of reverse-process steps is reduced to single digits.

+
+ image-20250503135351246 +
Images generated by conventional DMs with only a few steps of reverse process. Source: Frans et al., “One Step Diffusion via Shortcut Models.”
+
+

To understand why DMs scale poorly with few reverse process steps, we can return to the ODE vector field perspective of DMs. When the target data distribution is complex, the vector field typically contains numerous intersections. When a given \(X_t\) at time \(t\) lies at one of these intersections, the learned vector points in the averaged direction of all candidate trajectories. This causes the generated data to approach the mean of the training data when only a few reverse process steps are used. Another explanation is that the learned vector field is highly curved. Using only a few reverse process steps means attempting to approximate these curves with polylines, which is inherently difficult.

+
+ image-20250503141422791 +
Illustration of why DMs scale poorly with few reverse-process steps. Source: Frans et al., “One Step Diffusion via Shortcut Models.”
+
+

We will introduce two branches of methods that aim to scale DMs down to a few or even a single reverse-process step: distillation-based methods, which distill a pre-trained DM into a one-step model; and end-to-end methods, which train a one-step DM from scratch.

+

Distillation

+

Distillation-based methods are also called rectified flow methods. Their idea follows the above insight of "curved ODE vector field": if the curved vectors (flows) are hindering the scaling of reverse process steps, can we try to straighten these vectors so that they are easy to approximate with polylines or even straight lines?

+

Liu, Gong, and Liu, "Flow Straight and Fast" implements this idea, focusing on learning an ODE that follows straight vectors as much as possible. In the context of continuous time DMs where \(T=1\) and and \(t\in[0,1]\), suppose the clean data \(X_0\) and noise \(X_1\) each follows a data distribution, \(X_0\sim \pi_0\) and \(X_1\sim \pi_1\). The "straight vectors" can be achieved by solving a nonlinear least squares optimization problem: +$$ +\min_{v} \int_{0}^{1} \mathbb{E}\left[\left|\left(X_{1}-X_{0}\right)-v\left(X_{t}, t\right)\right|^{2}\right] \mathrm{d} t, +$$

+
\[ +\quad X_{t}=t X_{1}+(1-t) X_{0} +\]
+

where \(v\) is the vector field of the ODE \(\mathrm{d}Z_t = v(Z_t,t)\,\mathrm{d}t\).

+
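A minimal sketch of this objective (assuming PyTorch; `v_net` is a placeholder vector-field network and the batch shapes are illustrative): sample a pair \((X_0, X_1)\), interpolate linearly, and regress the field toward the straight direction \(X_1 - X_0\).

```python
import torch

def rectified_flow_loss(v_net, x0, x1):
    """One training step of the straight-flow objective.

    x0: clean data samples, x1: noise samples (following the article's convention).
    """
    t = torch.rand(x0.shape[0], 1)          # uniform timesteps in [0, 1]
    xt = t * x1 + (1 - t) * x0              # linear interpolation X_t
    target = x1 - x0                        # the straight direction to regress onto
    return ((v_net(xt, t) - target) ** 2).mean()
```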

Though straightforward, when the clean data distribution \(\pi_0\) is very complicated, the ideal result of completely straight vectors can be hard to achieve. To address this, a "reflow" procedure is introduced. This procedure iteratively trains new rectified flows using data generated by previously obtained flows: +$$ +Z^{(k+1)} = \text{RectFlow}\left(\left(Z_0^{(k)}, Z_1^{(k)}\right)\right) +$$ +This procedure produces increasingly straight flows that can be simulated with very few steps, ideally one step after several iterations.

+
+ image-20250504142749208 +
Illustrations of vector fields after different times of reflow processes. Source: Liu, Gong, and Liu, “Flow Straight and Fast.”
+
+

In practice, distillation-based methods are usually trained in two stages: first train a normal DM, and later distill one-step capabilities into it. This introduces additional computational overhead and complexity.

+

End-to-end

+

Compared to distillation-based methods, end-to-end-based methods train a one-step-capable diffusion model (DM) within a single training run. Various techniques are used to implement such methods. We will focus on two of them: consistency models and shortcut models.

+

Consistency Models

+

In discrete-timestep diffusion models (DMs), three components in the reverse denoising diffusion process are interchangeable through reparameterization: the noise component \(\epsilon_t\) to remove, the less noisy previous step \(x_{t-1}\), and the predicted clean sample \(x_0\). This interchangeability is enabled by the following equation: +$$ +x_t = \sqrt{\bar{\alpha}_t} \, x_0 + \sqrt{1 - \bar{\alpha}_t} \, \epsilon_t +$$ +In theory, without altering the fundamental formulation of DMs, the learnable denoiser network can be designed to predict any of these three components. Consistency models (CMs) follow this principle by training the denoiser to specifically predict the clean sample \(x_0\). The benefit of this approach is that CMs can naturally scale to perform the reverse process with few steps or even a single step.

+
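This interchangeability can be made concrete with a short sketch (assuming PyTorch and a standard \(\bar{\alpha}\) schedule; the schedule values are illustrative): a noise prediction can be converted into a clean-sample prediction by inverting the equation above, and vice versa.

```python
import torch

T = 1000
alphas_bar = torch.cumprod(1 - torch.linspace(1e-4, 0.02, T), dim=0)

def eps_to_x0(x_t, t, eps_pred):
    """Recover the predicted clean sample from a predicted noise component."""
    return (x_t - (1 - alphas_bar[t]).sqrt() * eps_pred) / alphas_bar[t].sqrt()

def x0_to_eps(x_t, t, x0_pred):
    """Recover the implied noise component from a predicted clean sample."""
    return (x_t - alphas_bar[t].sqrt() * x0_pred) / (1 - alphas_bar[t]).sqrt()
```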
+ image-20250504161430743 +
A consistency model that learns to map any point on the ODE trajectory to the clean sample. Source: Song et al., “Consistency Models.”
+
+

Formally, CMs learn a function \(f_\theta(x_t,t)\) that maps noisy data \(x_t\) at time \(t\) directly to the clean data \(x_0\), satisfying, for any two points \(x_t, x_{t'}\) on the same ODE trajectory: +$$ +f_\theta(x_t, t) = f_\theta(x_{t'}, t') +$$ +The model must also obey the differential consistency condition: +$$ +\frac{d}{dt} f_\theta(x_t, t) = 0 +$$ +CMs are trained by minimizing the discrepancy between outputs at adjacent times, with the loss function: +$$ +\mathcal{L} = \mathbb{E} \left[ d\left(f_\theta(x_t, t), f_\theta(x_{t'}, t')\right) \right] +$$ +Similar to continuous-timestep DMs and discrete-timestep DMs, CMs also have continuous-time and discrete-time variants. Discrete-time CMs are easier to train, but are more sensitive to timestep scheduling and suffer from discretization errors. Continuous-time CMs, on the other hand, suffer from instability during training.

+
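A minimal sketch of one discrete-time consistency training step (assuming PyTorch; `f_theta` is the consistency network, `sigmas` is a discretized noise schedule, and the stop-gradient target stands in for the EMA target network used in practice):

```python
import torch

def consistency_loss(f_theta, x0, sigmas, i):
    """Discrepancy between outputs at two adjacent noise levels sigma_{i+1} > sigma_i."""
    noise = torch.randn_like(x0)
    x_hi = x0 + sigmas[i + 1] * noise        # noisier point on the trajectory
    x_lo = x0 + sigmas[i] * noise            # its less-noisy neighbor (same noise draw)
    with torch.no_grad():                    # target uses stop-grad (EMA net in practice)
        target = f_theta(x_lo, sigmas[i])
    pred = f_theta(x_hi, sigmas[i + 1])
    return ((pred - target) ** 2).mean()     # d(f(x_t, t), f(x_t', t'))
```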

For a deeper discussion of the differences between the two variants of CMs, and how to stabilize continuous-time CMs, please refer to Lu and Song, "Simplifying, Stabilizing and Scaling Continuous-Time Consistency Models."

+

Shortcut Models

+

Similar to distillation-based methods, the core idea of shortcut models is inspired by the "curved vector field" problem, but shortcut models take a different approach to solving it.

+

Shortcut models are introduced in Frans et al., "One Step Diffusion via Shortcut Models." The paper presents the insight that conventional DMs' poor performance when jumping with large step sizes stems from their lack of awareness of the step size they are asked to jump forward. Since they are only trained with small step sizes, they only learn the tangents of the curved vector field, not the "correct direction" for a large step.

+

Based on this insight, on top of \(x_t\) and \(t\), shortcut models additionally include step size \(d\) as part of the condition for the denoiser network. At small step sizes (\(d\rightarrow 0\)), the model behaves like a standard flow-matching model, learning the expected tangent from noise to data. For larger step sizes, the model learns that one large step should equal two consecutive smaller steps (self-consistency), creating a binary recursive formulation. The model is trained by combining the standard flow matching loss when \(d=0\) and the self-consistency loss when \(d>0\): +$$ +\mathcal{L} = \mathbb{E}\Big[ \underbrace{\|s_\theta(x_t, t, 0) - (x_1 - x_0)\|^2}_{\text{Flow-Matching}} + \underbrace{\|s_\theta(x_t, t, 2d) - \mathbf{s}_{\text{target}}\|^2}_{\text{Self-Consistency}} \Big], +$$

+
\[ +\text{where} \quad \mathbf{s}_{\text{target}} = s_\theta(x_t, t, d)/2 + s_\theta(x'_{t+d},\, t + d,\, d)/2 \quad \text{and} \quad x'_{t+d} = x_t + s_\theta(x_t, t, d)\, d. +\]
+
+ image-20250504180714955 +
Illustration of the training process of shortcut models. Source: Frans et al., “One Step Diffusion via Shortcut Models.”
+
+
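A minimal sketch of this combined objective (assuming PyTorch; `s_net` is a placeholder step-size-conditioned network, and the sampling of timesteps and step sizes is simplified relative to the paper):

```python
import torch

def shortcut_loss(s_net, x0, x1, d=0.125):
    """Flow-matching at step size 0 plus self-consistency at step size 2d."""
    t = torch.rand(x0.shape[0], 1)
    xt = t * x1 + (1 - t) * x0

    # Flow-matching term: at d = 0 the model learns the usual tangent x1 - x0.
    fm = ((s_net(xt, t, torch.zeros_like(t)) - (x1 - x0)) ** 2).mean()

    # Self-consistency term: one step of size 2d should match two steps of size d.
    d_vec = torch.full_like(t, d)
    with torch.no_grad():
        s1 = s_net(xt, t, d_vec)
        x_next = xt + s1 * d                      # x'_{t+d}
        s2 = s_net(x_next, t + d, d_vec)
        s_target = (s1 + s2) / 2
    sc = ((s_net(xt, t, 2 * d_vec) - s_target) ** 2).mean()

    return fm + sc
```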

Both consistency models and shortcut models can be seamlessly scaled between one-step and multi-step generation to balance quality and efficiency.

+
+

Copyright © 2025. Designed and implemented by Yan Lin.

+
+ + + + + \ No newline at end of file diff --git a/dist/blog/index.html b/dist/blog/index.html new file mode 100644 index 0000000..881c8f8 --- /dev/null +++ b/dist/blog/index.html @@ -0,0 +1,96 @@ + + + + + + + Yan Lin's Blog + + + + + + + + +
+ +
+ +
+
+ + + + + +
+
+ +
+
+ + + +
+
+ + +
+ + +
+
+ +
+ One Step Diffusion Models | May 2025 +

Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.

+
+ +
+ Multi-modal and Multi-function Transformers | April 2025 +

Multi-modal and multi-function Transformers enable a single architecture to process diverse data types such as language, images, and videos simultaneously. These models employ techniques like vector quantization and lookup-free quantization to map different modalities into a unified embedding space, allowing the Transformer to handle them within the same sequence. Beyond processing multiple data types, these architectures can also combine different functionalities, such as auto-regressive language generation and diffusion-based image creation, within a single model.

+
+ +
+
+ +
+ + + + + + + + + + + \ No newline at end of file diff --git a/dist/blog/md/multi-modal-transformer.assets/1_Dk1X5rmLomXqqTPeuHgBpw.png b/dist/blog/md/multi-modal-transformer.assets/1_Dk1X5rmLomXqqTPeuHgBpw.png new file mode 100644 index 0000000..9315270 Binary files /dev/null and b/dist/blog/md/multi-modal-transformer.assets/1_Dk1X5rmLomXqqTPeuHgBpw.png differ diff --git a/dist/blog/md/multi-modal-transformer.assets/Screen_Shot_2020-06-28_at_4.26.40_PM.png b/dist/blog/md/multi-modal-transformer.assets/Screen_Shot_2020-06-28_at_4.26.40_PM.png new file mode 100644 index 0000000..0e41599 Binary files /dev/null and b/dist/blog/md/multi-modal-transformer.assets/Screen_Shot_2020-06-28_at_4.26.40_PM.png differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image (1).png b/dist/blog/md/multi-modal-transformer.assets/image (1).png new file mode 100644 index 0000000..11d9b92 Binary files /dev/null and b/dist/blog/md/multi-modal-transformer.assets/image (1).png differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image (2).png b/dist/blog/md/multi-modal-transformer.assets/image (2).png new file mode 100644 index 0000000..a7ef23b Binary files /dev/null and b/dist/blog/md/multi-modal-transformer.assets/image (2).png differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image (3).png b/dist/blog/md/multi-modal-transformer.assets/image (3).png new file mode 100644 index 0000000..d28e865 Binary files /dev/null and b/dist/blog/md/multi-modal-transformer.assets/image (3).png differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image (4).png b/dist/blog/md/multi-modal-transformer.assets/image (4).png new file mode 100644 index 0000000..9e97efb Binary files /dev/null and b/dist/blog/md/multi-modal-transformer.assets/image (4).png differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image (5).png b/dist/blog/md/multi-modal-transformer.assets/image (5).png new file mode 100644 index 0000000..e1c27c4 Binary files /dev/null and b/dist/blog/md/multi-modal-transformer.assets/image (5).png differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image (6).png b/dist/blog/md/multi-modal-transformer.assets/image (6).png new file mode 100644 index 0000000..47ff387 Binary files /dev/null and b/dist/blog/md/multi-modal-transformer.assets/image (6).png differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image.png b/dist/blog/md/multi-modal-transformer.assets/image.png new file mode 100644 index 0000000..7db5437 Binary files /dev/null and b/dist/blog/md/multi-modal-transformer.assets/image.png differ diff --git a/dist/blog/md/multi-modal-transformer.md b/dist/blog/md/multi-modal-transformer.md new file mode 100644 index 0000000..5f9af3a --- /dev/null +++ b/dist/blog/md/multi-modal-transformer.md @@ -0,0 +1,148 @@ +# Multi-modal and Multi-function Transformers + +Transformers have gained immense popularity within deep learning and AI communities in recent years. Since their introduction in *Vaswani et al., "Attention Is All You Need"*, they have proven to be powerful sequential models across diverse domains, with thousands of variations and "improved versions." The rise of Large Language Models (LLMs), which largely use Transformers as their foundation, has led to another surge in research around this architecture. This trend has even led graph learning and Computer Vision (CV) communities to move beyond their established foundation models (i.e., GNNs and CNNs) and embrace Transformers. 
This explains the increasing prevalence of graph Transformers and image Transformers today. + +> Han et al., “A Survey on Vision Transformer”; Khan et al., “Transformers in Vision”; Yun et al., “Graph Transformer Networks.” + +Beyond "chasing the trend," using Transformer as a unified foundation model offers several advantages: + +- Transformers excel at capturing long-term dependencies. Unlike GNNs and CNNs which require deeper network structures for longer context, Transformers natively support global dependency modeling through their self-attention mechanism. They also avoid global smoothing and vanishing gradient problems that hinder context length scaling in other network architectures. +- Transformers process sequences in parallel rather than sequentially, enabling full utilization of GPU acceleration. This advantage can be further enhanced with techniques like those described in *Dao et al., "FlashAttention."* +- Transformers are flexible network structures. They don't inherently enforce sequentiality—without positional encoding, the ordering of input steps to Transformers is equivalent. Through strategic permutation and positional encoding, Transformers can adapt to a wide range of structured and unstructured data. +- The development of LLMs has made many open-weight Transformer models available with strong natural language understanding capabilities. These Transformers can be prompted and fine-tuned to model other modalities such as spatiotemporal data and images while retaining their language modeling abilities, creating opportunities for developing multi-modal foundation models. +- From a practical perspective, using Transformer as a foundation allows reuse of technical infrastructure and optimizations developed over years, including efficient architecture designs, training pipelines, and specialized hardware. + +In this article, we will briefly explore techniques for unifying multiple modalities (e.g., natural language and images) and multiple functionalities (e.g., language models and diffusion denoisers) within a single Transformer. These techniques are largely sourced from recent oral papers presented at ICML, ICLR, and CVPR conferences. I assume readers have general knowledge of basic concepts in ML and neural networks, Transformers, LLMs, and diffusion models. + +Since images and language modalities represent continuous and discrete data respectively, we will use them as examples throughout this article. Keep in mind that the techniques introduced can be readily extended to other modalities, including spatiotemporal data. + +# General Goal + +The goal of a multi-modal Transformer is to create a model that can accept multi-modal inputs and produce multi-modal outputs. For example, instead of using a CNN-based image encoder and a Transformer-based language encoder to map image and language modalities to the latent space separately, a multi-modal Transformer would be able to process the combination of image and language (sentence) as a single sequence. + +![image](multi-modal-transformer.assets/image.png) + +> An example of “conventional” multi-modal fusion. Different modality is processed by separate models and fused at some point. Source: *Xiang, Hao, Runsheng Xu, and Jiaqi Ma. "HM-ViT: Hetero-modal vehicle-to-vehicle cooperative perception with vision transformer." CVPR, 2023.* + +![image (1)](multi-modal-transformer.assets/image (1).png) + +> An example of a Transformer that can handle multi-modal inputs and outputs. 
Different modalities are all projected into tokens and subsequently processed by a unified Transformer encoder. Source: *Kondratyuk, Dan, Lijun Yu, et al. “VideoPoet: A Large Language Model for Zero-Shot Video Generation,” ICML, 2024.* + +Beyond multi-modal processing, a multi-function Transformer can, for example, function as both a language model (auto-regressive generation) and diffusion denoiser (score-matching generation) simultaneously, supporting two of the most common generation schemes used today. + +# Modality Embedding + +A fundamental challenge in unifying multiple modalities within a single Transformer is how to represent different modalities in the same embedding space. For the "QKV" self-attention mechanism to work properly, each item in the input sequence must be represented by an embedding vector of the same dimension, matching the "model dimension" of the Transformer. + +![image (2)](multi-modal-transformer.assets/image (2).png) + +> Illustration of the QKV self-attention mechanism in Transformer. [Source](https://en.wikipedia.org/wiki/Attention_(machine_learning)) + +The most common method for mapping language into the embedding space is through tokenization and token embedding. A tokenizer maps a word or word fragment into a discrete token index, and an index-fetching embedding layer (implemented in frameworks like PyTorch with `nn.Embedding`) maps this index into a fixed-dimension embedding vector. In principle, all discrete features can be mapped into the embedding space using this approach. + +![1_Dk1X5rmLomXqqTPeuHgBpw](multi-modal-transformer.assets/1_Dk1X5rmLomXqqTPeuHgBpw.png) + +> Visualization of tokenizer and index-fetching embedding layer. [Source](https://medium.com/@hunter-j-phillips/the-embedding-layer-27d9c980d124) + +## Vector Quantization + +For continuous features, one intuitive approach is to first tokenize them into discrete tokens, thereby unifying the embedding process across both discrete and continuous features. **Vector quantization**, introduced in VQ-VAE, is one of the most common methods for this purpose. + +> Van Den Oord, Aaron, and Oriol Vinyals. "Neural discrete representation learning." NeurIPS, 2017. + +Vector quantization maintains a "codebook" $\boldsymbol C \in \mathbb R^{n\times d}$, which functions similarly to the index-fetching embedding layer, where $n$ is the total number of unique tokens, and $d$ is the embedding size. A given continuous vector $\boldsymbol{z}\in\mathbb R^{d}$ is quantized into a discrete value $i\in\mathbb [0,n-1]$ by finding the closest row vector in $\boldsymbol C$ to $\boldsymbol{z}$, and that row vector $\boldsymbol C_i$ is fetched as the embedding for $\boldsymbol{z}$. Formally: +$$ +i = \arg\min_j ||\boldsymbol z - \boldsymbol C_j||₂ +$$ +![Screen_Shot_2020-06-28_at_4.26.40_PM](multi-modal-transformer.assets/Screen_Shot_2020-06-28_at_4.26.40_PM.png) + +## Lookup-Free Quantization + +A significant limitation of vector quantization is that it requires calculating distances between the given continuous vectors and the entire codebook, which becomes computationally expensive for large-scale codebooks. This creates tension with the need for expanded codebooks to represent complex modalities such as images and videos. Research has shown that simply increasing the number of unique tokens doesn't always improve codebook performance. + +> “A simple trick for training a larger codebook involves decreasing the code embedding dimension when increasing the vocabulary size.” Source: *Yu, Lijun, Jose Lezama, et al. 
“Language Model Beats Diffusion - Tokenizer Is Key to Visual Generation,” ICLR, 2024.*
+
+Building on this insight, **Lookup-Free Quantization** (LFQ) eliminates the embedding dimension of codebooks (essentially reducing the embedding dimension to 0) and directly calculates the discrete index $i$ by individually quantizing each dimension of $\boldsymbol z$ into a binary digit. The index $i$ can then be computed by converting the binary representation to decimal. Formally:
+$$
+i=\sum_{j=1}^{d} 2^{(j-1)}\cdot \mathbb{1}(z_j > 0)
+$$
+
+> For example, given a continuous vector $\boldsymbol z=\langle -0.52, 1.50, 0.53, -1.32\rangle$, we first quantize each dimension into $\langle 0, 1, 1, 0\rangle$, based on the sign of each dimension. The token index of $\boldsymbol z$ is simply the decimal equivalent of the binary 0110, which is 6.
+
+However, this approach introduces another challenge: we still need an index-fetching embedding layer to map these token indices into embedding vectors for the Transformer. This, combined with the typically large number of unique tokens when using LFQ—a 32-dimensional $\boldsymbol z$ will result in $2^{32}=4,294,967,296$ unique tokens—creates significant efficiency problems. One solution is to factorize the token space. Effectively, this means splitting the binary digits into multiple parts, embedding each part separately, and concatenating the resulting embedding vectors. For example, with a 32-dimensional $\boldsymbol z$, if we quantize and embed its first and last 16 dimensions separately, we “only” need to handle $2^{16}\times 2= 131,072$ unique tokens. (A minimal code sketch of both quantization schemes is given at the end of this section.)
+
+Note that this section doesn't extensively explain how to map raw continuous features into the vector $\boldsymbol{z}$, as these techniques are relatively straightforward and depend on the specific feature type—for example, fully-connected layers for numerical features, or CNN/GNN with feature flattening for structured data.
+
+## Quantization over Linear Projection
+
+You might be asking—why can't we simply use linear projections to map the raw continuous features into the embedding space? What are the benefits of quantizing continuous features into discrete tokens?
+
+Although Transformers are regarded as universal sequential models, they were designed for discrete tokens when first introduced in *Vaswani et al., "Attention Is All You Need"*. Empirically, they tend to perform better on discrete tokens than on raw continuous features. This is supported by many research papers claiming that quantizing continuous features improves the performance of Transformers, and by works demonstrating Transformers' subpar performance when applied directly to continuous features.
+
+> Mao, Chengzhi, Lu Jiang, Mostafa Dehghani, Carl Vondrick, Rahul Sukthankar, and Irfan Essa. “Discrete Representations Strengthen Vision Transformer Robustness,” ICLR, 2022.
+
+> Ilbert, Romain, Ambroise Odonnat, et al. “SAMformer: Unlocking the Potential of Transformers in Time Series Forecasting with Sharpness-Aware Minimization and Channel-Wise Attention,” ICML, 2024.
+
+On the other hand, unifying different modalities into tokens is especially beneficial in the context of Transformer-based "foundation models," since it preserves the auto-regressive next-token prediction architecture of LLMs. Combined with special tokens such as "start of sentence" and "end of sentence," the Transformer model can flexibly generate content of mixed modalities and varied length.
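To make the two quantization schemes above concrete, here is a minimal PyTorch sketch of nearest-neighbor vector quantization and lookup-free quantization with a factorized token space. It is only a sketch under the notation of this section; the codebook size, embedding dimensions, and tensor shapes are illustrative assumptions rather than the exact implementations used in VQ-VAE or the LFQ paper.

```python
import torch
import torch.nn as nn

def vector_quantize(z, codebook):
    """Nearest-neighbor vector quantization: i = argmin_j ||z - C_j||_2."""
    dists = torch.cdist(z, codebook)        # (batch, n) pairwise L2 distances
    indices = dists.argmin(dim=-1)          # discrete token index i
    return indices, codebook[indices]       # fetch C_i as the embedding of z

def lfq_index(z):
    """Lookup-free quantization: i = sum_j 2^(j-1) * 1[z_j > 0]."""
    bits = (z > 0).long()                                # binarize each dimension by sign
    weights = 2 ** torch.arange(z.shape[-1])             # 2^(j-1) for j = 1..d
    return (bits * weights).sum(dim=-1)                  # (batch,) token indices

def factorized_lfq_embed(z, tables):
    """Factorized token space: split the binary digits into groups, embed each
    group with its own (smaller) table, and concatenate the resulting vectors."""
    chunks = z.chunk(len(tables), dim=-1)
    return torch.cat([t(lfq_index(c)) for t, c in zip(tables, chunks)], dim=-1)

# Vector quantization with a 512-entry, 64-dimensional codebook
codebook = torch.randn(512, 64)
idx, emb = vector_quantize(torch.randn(4, 64), codebook)

# The 4-dimensional LFQ example from the text: <-0.52, 1.50, 0.53, -1.32> -> 0110 -> 6
print(lfq_index(torch.tensor([[-0.52, 1.50, 0.53, -1.32]])))  # tensor([6])

# A 32-dimensional z split into two 16-bit halves: two tables of 2^16 entries each
tables = nn.ModuleList([nn.Embedding(2 ** 16, 128) for _ in range(2)])
print(factorized_lfq_embed(torch.randn(8, 32), tables).shape)  # torch.Size([8, 256])
```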
+
+> For example, by quantizing videos into discrete tokens and combining the token spaces of videos and language, one can create a unified Transformer model that generates both videos and language in one sequence. The start and end points of video and language sub-sequences are fully determined by the model, based on the specific input prompt. This structure would be difficult to replicate if we used tokenization for language but linear projection for videos.
+
+# Transformer Backbone
+
+After different modalities are mapped into the same embedding space, they can be arranged into a sequence of embedding vectors and input into a Transformer backbone. We don't discuss the variations of the Transformer structure and improvement techniques here, as they are numerous and ultimately serve the same role as sequential models.
+
+> Lan et al., “ALBERT”; Ye et al., “Differential Transformer”; Kitaev, Kaiser, and Levskaya, “Reformer”; Su et al., “RoFormer”; Dai et al., “Transformer-XL.”
+
+As we know, the "full" Transformer structure proposed in *Vaswani et al., "Attention Is All You Need"* includes an encoder and a decoder. They perform self-attention within their respective input sequences, and the decoder additionally performs cross-attention between its input sequence and the memory sequence derived from the encoder's output. Some early language models use an encoder-only structure (like *Devlin et al., "BERT"*), focused on outputting embedding vectors, or an encoder-decoder structure (like *Chung et al., "Scaling Instruction-Finetuned Language Models"*) for generating natural language output. Most modern large language models and foundation models use a decoder-only structure (like *Brown et al., "Language Models Are Few-Shot Learners"*), focusing on auto-regressive generation of language output.
+
+The encoder-only structure theoretically excels at representation learning, and the embedding vectors it produces can be applied to various downstream tasks. Recent developments have gradually moved towards the decoder-only structure, centered around the idea of building models that are capable of directly generating the required final output of every downstream task.
+
+> For example, to perform sentiment analysis, BERT will compute an embedding vector for the query sentence, and the embedding vector can be used in a dedicated classifier to predict the sentiment label. GPT, on the other hand, can directly answer the question "what is the sentiment associated with the query sentence?" Comparatively, GPT is more versatile in most cases and can easily perform zero-shot prediction.
+
+Nevertheless, representation learning is still a relevant topic. The general understanding is that the decoder-only structure cannot perform conventional representation learning, for example mapping a sentence into a fixed-dimension embedding vector. Yet, there are a few works in the latest ICLR that shed light on using LLMs as representation learning or embedding models:
+
+> Gao, Leo, Tom Dupre la Tour, Henk Tillman, Gabriel Goh, Rajan Troll, Alec Radford, Ilya Sutskever, Jan Leike, and Jeffrey Wu. “Scaling and Evaluating Sparse Autoencoders,” 2024. [Link](https://openreview.net/forum?id=tcsZt9ZNKD)
+
+> Li, Ziyue, and Tianyi Zhou. “Your Mixture-of-Experts LLM Is Secretly an Embedding Model for Free,” 2024. [Link](https://openreview.net/forum?id=eFGQ97z5Cd)
+
+> Zhang, Jie, Dongrui Liu, Chen Qian, Linfeng Zhang, Yong Liu, Yu Qiao, and Jing Shao. “REEF: Representation Encoding Fingerprints for Large Language Models,” 2024.
[Link](https://openreview.net/forum?id=SnDmPkOJ0T)
+
+# Output Layer
+
+For language generation, Transformers typically use classifier output layers, mapping the latent vector of each item in the output sequence back to tokens. As we've established in the "Modality Embedding" section, the preferred method to embed continuous features is to quantize them into discrete tokens. Correspondingly, an intuitive method to output continuous features is to map these discrete tokens back to the continuous feature space, essentially reversing the vector quantization process.
+
+## Reverse Vector Quantization
+
+One approach to reverse vector quantization is readily available in VQ-VAE, since it is an auto-encoder. Given a token $i$, we can look up its embedding in the codebook as $\boldsymbol C_i$, then apply a decoder network to map $\boldsymbol C_i$ back to the continuous feature vector $\boldsymbol z$. The decoder network can either be pre-trained within the VQ-VAE framework—training the VQ-VAE tokenizer, encoder, and decoder with auto-encoding loss functions—or trained end-to-end along with the whole Transformer. In the NLP and CV communities, the pre-training approach is more popular, since there are many large-scale pre-trained auto-encoders available.
+
+![image (4)](multi-modal-transformer.assets/image (4).png)
+
+> The encoder-decoder structure of MAGVIT (*Yu et al., “MAGVIT”*), a visual VQ-VAE model. A 3D-VQ encoder quantizes a video into discrete tokens, and a 3D-VQ decoder maps them back to the pixel space.
+
+## Efficiency Enhancement
+
+For continuous feature generation, unlike language generation where the output tokens are the final output, we are essentially representing the final output with a token space of limited size. Thus, for complicated continuous features like images and videos, we have to expand the token space or use more tokens to represent one image or one video frame to improve generation quality, which can result in efficiency challenges.
+
+There are several workarounds to improve the efficiency of multi-modal outputs. One approach is to generate low-resolution outputs first, then use a separate super-resolution module to improve the quality of the output. This approach is explored in *Kondratyuk et al., "VideoPoet"* and *Tian et al., "Visual Autoregressive Modeling"*. Interestingly, the overall idea is very similar to NVIDIA's DLSS, where the graphics card renders a low-resolution frame (e.g., 1080p) using the conventional rasterization pipeline, then a super-resolution model increases the frame's resolution (e.g., to 4K) utilizing the graphics card's tensor hardware, improving the game's overall frame rate.
+
+Another workaround follows the idea of compression. Take video generation as an example. The model generates full features for key frames, and light-weight features for motion vectors that describe subtle differences from those key frames. This is essentially how inter-frame compressed video codecs work, taking advantage of the temporal redundancy between neighboring frames.
+
+![image (5)](multi-modal-transformer.assets/image (5).png)
+
+> Key frames and motion vectors used in *Jin et al., “Video-LaVIT.”*
+
+# Fuse with Diffusion Models
+
+Despite continuous efforts to enable representation and generation of images and videos with a language model structure (auto-regressive), current research indicates that diffusion models (more broadly speaking, score-matching generative models) outperform language models on continuous feature generation.
Score-matching generative models have their own separate and substantial community, with strong theoretical foundations and numerous variations emerging each year, such as stochastic differential equations, Bayesian flow, and rectified flow. In short, score-matching generative models are clearly here to stay alongside language models.
+
+An intriguing question arises: why not integrate the structures of language models and diffusion models into one Transformer to reach the best of both worlds? *Zhou et al., "Transfusion,"* explored this idea. The approach is straightforward: build a Transformer that can handle both language and image inputs and outputs. The language component functions as a language model, while the image component serves as a denoiser network for diffusion models. The model is trained by combining the language modeling loss and the DDPM loss, enabling it to function either as a language model or as a text-to-image denoiser (a minimal sketch of this combined objective is given at the end of this post).
+
+![image (6)](multi-modal-transformer.assets/image (6).png)
+
+> A Transformer capable of functioning as a language model and a diffusion denoiser at the same time. Source: *Zhou, Chunting, Lili Yu, et al. “Transfusion: Predict the Next Token and Diffuse Images with One Multi-Modal Model,” ICLR, 2025.*
+
+# Conclusion
+
+In conclusion, the evolution of Transformers into versatile foundation models capable of handling multiple modalities and functionalities represents a significant advancement in AI research. By enabling a single architecture to process diverse data types through techniques like vector quantization and lookup-free quantization, researchers have created models that can seamlessly integrate language, images, and other modalities within the same embedding space.
+
+In our research domain, we encounter even more diverse and domain-specific multi-modal data, such as traffic flows, trajectories, and real-world agent interactions. A unified Transformer for such data presents a promising solution for creating "foundation models" that generalize across diverse tasks and scenarios. However, domain-specific challenges, including data encoding and decoding, computational efficiency, and scalability, must be addressed to realize this potential.
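As referenced above, here is a minimal sketch of what a Transfusion-style joint objective could look like: language-model cross-entropy on text positions combined with a DDPM-style noise-prediction loss on image positions. The loss weighting `lam` and the epsilon-prediction parameterization are assumptions for illustration and are not taken verbatim from the Transfusion paper.

```python
import torch
import torch.nn.functional as F

def joint_lm_diffusion_loss(text_logits, text_targets, noise_pred, noise_true, lam=1.0):
    """Combine a language-modeling loss (discrete text tokens) with a
    DDPM-style denoising loss (continuous image latents) in one training step."""
    lm_loss = F.cross_entropy(
        text_logits.reshape(-1, text_logits.size(-1)),  # (batch * seq, vocab)
        text_targets.reshape(-1),                        # next-token targets
    )
    ddpm_loss = F.mse_loss(noise_pred, noise_true)       # epsilon-prediction MSE
    return lm_loss + lam * ddpm_loss

# Toy shapes: 2 sequences of 16 text tokens and 2 noisy image latents of dimension 64
text_logits = torch.randn(2, 16, 1000)
text_targets = torch.randint(0, 1000, (2, 16))
noise_pred, noise_true = torch.randn(2, 64), torch.randn(2, 64)
print(joint_lm_diffusion_loss(text_logits, text_targets, noise_pred, noise_true))
```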
\ No newline at end of file diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250503125941212.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250503125941212.png new file mode 100644 index 0000000..5b261ec Binary files /dev/null and b/dist/blog/md/one-step-diffusion-models.assets/image-20250503125941212.png differ diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250503132738122.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250503132738122.png new file mode 100644 index 0000000..c3c493c Binary files /dev/null and b/dist/blog/md/one-step-diffusion-models.assets/image-20250503132738122.png differ diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250503135351246.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250503135351246.png new file mode 100644 index 0000000..f8dd53f Binary files /dev/null and b/dist/blog/md/one-step-diffusion-models.assets/image-20250503135351246.png differ diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250503141422791.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250503141422791.png new file mode 100644 index 0000000..da6aedf Binary files /dev/null and b/dist/blog/md/one-step-diffusion-models.assets/image-20250503141422791.png differ diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250504142749208.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250504142749208.png new file mode 100644 index 0000000..2d3efe8 Binary files /dev/null and b/dist/blog/md/one-step-diffusion-models.assets/image-20250504142749208.png differ diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250504161430743.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250504161430743.png new file mode 100644 index 0000000..8c5b463 Binary files /dev/null and b/dist/blog/md/one-step-diffusion-models.assets/image-20250504161430743.png differ diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250504180714955.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250504180714955.png new file mode 100644 index 0000000..55b3d0c Binary files /dev/null and b/dist/blog/md/one-step-diffusion-models.assets/image-20250504180714955.png differ diff --git a/dist/blog/md/one-step-diffusion-models.md b/dist/blog/md/one-step-diffusion-models.md new file mode 100644 index 0000000..42eebd2 --- /dev/null +++ b/dist/blog/md/one-step-diffusion-models.md @@ -0,0 +1,137 @@
+# One Step Diffusion Models
+
+Despite the promising performance of diffusion models on continuous modality generation, one deficiency holding them back is their reliance on a multi-step denoising process, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.
+
+---
+
+# Background
+
+Diffusion models (DMs), or more broadly speaking, score-matching generative models, have become the de facto framework for building deep generative models. They demonstrate exceptional generation performance, especially on continuous modalities including images, videos, audio, and spatiotemporal data.
+
+Most diffusion models work by coupling a forward diffusion process and a reverse denoising diffusion process. The forward diffusion process gradually adds noise to the ground-truth clean data $X_0$, until noisy data $X_T$ that follows a relatively simple distribution is reached.
The reverse denoising diffusion process starts from the noisy data $X_T$, and removes the noise component step-by-step until clean generated data $X_0$ is reached. The reverse process is essentially a sequential Markov chain—each step depends on the output of the previous one—so it cannot be parallelized within a single generation, which can be inefficient when the number of steps is large.
+
+![image-20250503125941212](one-step-diffusion-models.assets/image-20250503125941212.png)
+
+> The two processes in a typical diffusion model. *Source: Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”*
+
+## Understanding DMs
+
+There are many ways to understand how Diffusion Models (DMs) work. One of the most common and intuitive approaches is that a DM learns an ordinary differential equation (ODE) that transforms noise into data. Imagine an ODE vector field between the noise $X_T$ and the clean data $X_0$. By training on a sufficiently large number of timesteps $t\in [0,T]$, a DM is able to learn the vector (tangent) towards the cleaner data $X_{t-\Delta t}$, given any specific timestep $t$ and the corresponding noisy data $X_t$. This idea is easy to illustrate in a simplified 1-dimensional data scenario.
+
+![image-20250503132738122](one-step-diffusion-models.assets/image-20250503132738122.png)
+
+> Illustrated ODE flow of a diffusion model on 1-dimensional data. *Source: Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”* It should be noted that, as the figure suggests, there are differences between ODEs and DMs in a narrow sense. Flow matching models, a variant of DMs, more closely resemble ODEs.
+
+## DMs Scale Poorly with Few Steps
+
+Vanilla DDPM, which is essentially a discrete-timestep DM, can only perform the reverse process using the same number of steps it is trained on, typically around a thousand. DDIM introduces a reparameterization scheme that enables skipping steps during the reverse process of DDPM (a minimal few-step sampler sketch is given at the end of this subsection). Continuous-timestep DMs, such as those formulated as stochastic differential equations (SDEs), naturally possess the capability of using fewer steps in the reverse process than in the forward process/training.
+
+> Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”
+> Song, Meng, and Ermon, “Denoising Diffusion Implicit Models.”
+> Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”
+
+Nevertheless, their performance typically degrades catastrophically when the number of reverse process steps is reduced to single digits.
+
+![image-20250503135351246](one-step-diffusion-models.assets/image-20250503135351246.png)
+
+> Images generated by conventional DMs with only a few reverse process steps. *Source: Frans et al., “One Step Diffusion via Shortcut Models.”*
+
+To understand why DMs scale poorly with few reverse process steps, we can return to the ODE vector field perspective of DMs. When the target data distribution is complex, the vector field typically contains numerous intersections. When a given $X_t$ and $t$ fall at one of these intersections, the learned vector points in the averaged direction of all candidates. This causes the generated data to approach the mean of the training data when only a few reverse process steps are used. Another explanation is that the learned vector field is highly curved. Using only a few reverse process steps means attempting to approximate these curves with polylines, which is inherently difficult.
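As referenced above, here is a minimal sketch of a deterministic DDIM-style sampler that runs the reverse process on a small subset of timesteps. The linear beta schedule, the number of steps, and the dummy noise-prediction network are illustrative assumptions; a real sampler would use a trained denoiser and the schedule of the underlying DM.

```python
import torch

def linear_alpha_bar(T=1000, beta_start=1e-4, beta_end=0.02):
    """Cumulative products (alpha-bar) of a linear beta schedule, DDPM-style."""
    betas = torch.linspace(beta_start, beta_end, T)
    return torch.cumprod(1.0 - betas, dim=0)

@torch.no_grad()
def ddim_sample(eps_model, shape, alpha_bar, num_steps=8):
    """Deterministic DDIM sampling over `num_steps` timesteps instead of all T."""
    T = alpha_bar.shape[0]
    timesteps = torch.linspace(T - 1, 0, num_steps).long()    # e.g. 8 out of 1000 steps
    x = torch.randn(shape)                                     # start from pure noise X_T
    for i, t in enumerate(timesteps):
        a_t = alpha_bar[t]
        a_prev = alpha_bar[timesteps[i + 1]] if i + 1 < num_steps else torch.tensor(1.0)
        eps = eps_model(x, t)                                  # predicted noise component
        x0_pred = (x - torch.sqrt(1 - a_t) * eps) / torch.sqrt(a_t)       # implied clean sample
        x = torch.sqrt(a_prev) * x0_pred + torch.sqrt(1 - a_prev) * eps   # jump to the next kept step
    return x

# Dummy denoiser standing in for a trained noise-prediction network
eps_model = lambda x, t: torch.zeros_like(x)
alpha_bar = linear_alpha_bar()
samples = ddim_sample(eps_model, (4, 3, 32, 32), alpha_bar, num_steps=8)
print(samples.shape)  # torch.Size([4, 3, 32, 32])
```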
+
+![image-20250503141422791](one-step-diffusion-models.assets/image-20250503141422791.png)
+
+> Illustration of why DMs scale poorly with few reverse process steps. *Source: Frans et al., “One Step Diffusion via Shortcut Models.”*
+
+We will introduce two branches of methods that aim to scale DMs down to a few or even a single reverse process step: **distillation-based** methods, which distill a pre-trained DM into a one-step model; and **end-to-end** methods, which train a one-step DM from scratch.
+
+# Distillation
+
+Distillation-based methods are exemplified by **rectified flow** methods. Their idea follows the above insight of the "curved ODE vector field": if the curved vectors (flows) are hindering the scaling of reverse process steps, can we try to straighten these vectors so that they are easy to approximate with polylines or even straight lines?
+
+*Liu, Gong, and Liu, "Flow Straight and Fast"* implements this idea, focusing on learning an ODE that follows straight vectors as much as possible. In the context of continuous-time DMs where $T=1$ and $t\in[0,1]$, suppose the clean data $X_0$ and the noise $X_1$ follow the distributions $X_0\sim \pi_0$ and $X_1\sim \pi_1$, respectively. The "straight vectors" can be achieved by solving a nonlinear least squares optimization problem:
+$$
+\min_{v} \int_{0}^{1} \mathbb{E}\left[\left\|\left(X_{1}-X_{0}\right)-v\left(X_{t}, t\right)\right\|^{2}\right] \mathrm{d} t, \quad X_{t}=t X_{1}+(1-t) X_{0},
+$$
+where $v$ is the vector field of the ODE $\mathrm{d}Z_t = v(Z_t,t)\,\mathrm{d}t$.
+
+Though straightforward, when the clean data distribution $\pi_0$ is very complicated, the ideal result of completely straight vectors can be hard to achieve. To address this, a "reflow" procedure is introduced. This procedure iteratively trains new rectified flows using data generated by previously obtained flows:
+$$
+Z^{(k+1)} = \texttt{RectFlow}\left(\left(Z_0^{(k)}, Z_1^{(k)}\right)\right)
+$$
+This procedure produces increasingly straight flows that can be simulated with very few steps, ideally one step after several iterations.
+
+![image-20250504142749208](one-step-diffusion-models.assets/image-20250504142749208.png)
+
+> Illustration of the vector fields after different numbers of reflow iterations. *Source: Liu, Gong, and Liu, “Flow Straight and Fast.”*
+
+In practice, distillation-based methods are usually trained in two stages: first train a normal DM, and later distill one-step capabilities into it. This introduces additional computational overhead and complexity.
+
+# End-to-end
+
+Compared to distillation-based methods, end-to-end methods train a one-step-capable diffusion model (DM) within a single training run. Various techniques are used to implement such methods. We will focus on two of them: **consistency models** and **shortcut models**.
+
+## Consistency Models
+
+In discrete-timestep diffusion models (DMs), three components in the reverse denoising diffusion process are interchangeable through reparameterization: the noise component $\epsilon_t$ to remove, the less noisy previous step $x_{t-1}$, and the predicted clean sample $x_0$. This interchangeability is enabled by the following equation:
+$$
+x_t = \sqrt{\bar{\alpha}_t} \, x_0 + \sqrt{1 - \bar{\alpha}_t} \, \epsilon_t
+$$
+In theory, without altering the fundamental formulation of DMs, the learnable denoiser network can be designed to predict any of these three components. Consistency models (CMs) follow this principle by training the denoiser to specifically predict the clean sample $x_0$.
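To make the interchangeability above concrete, here is a tiny sketch of converting between the noise component and the predicted clean sample using that equation; the helper names are purely illustrative.

```python
import torch

def x0_from_eps(x_t, eps, alpha_bar_t):
    """Invert x_t = sqrt(abar_t) * x_0 + sqrt(1 - abar_t) * eps for x_0."""
    return (x_t - torch.sqrt(1 - alpha_bar_t) * eps) / torch.sqrt(alpha_bar_t)

def eps_from_x0(x_t, x0, alpha_bar_t):
    """Invert the same equation for the noise component eps."""
    return (x_t - torch.sqrt(alpha_bar_t) * x0) / torch.sqrt(1 - alpha_bar_t)

# Round-trip check: recover x_0 from (x_t, eps) built with the forward equation
alpha_bar_t = torch.tensor(0.7)
x0, eps = torch.randn(2, 8), torch.randn(2, 8)
x_t = torch.sqrt(alpha_bar_t) * x0 + torch.sqrt(1 - alpha_bar_t) * eps
print(torch.allclose(x0_from_eps(x_t, eps, alpha_bar_t), x0, atol=1e-6))  # True
```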
The benefit of this approach is that CMs can naturally scale to perform the reverse process with few steps or even a single step.
+
+![image-20250504161430743](one-step-diffusion-models.assets/image-20250504161430743.png)
+
+> A consistency model that learns to map any point on the ODE trajectory to the clean sample. *Source: Song et al., “Consistency Models.”*
+
+Formally, CMs learn a function $f_\theta(x_t,t)$ that maps noisy data $x_t$ at time $t$ directly to the clean data $x_0$, satisfying:
+$$
+f_\theta(x_t, t) = f_\theta(x_{t'}, t') \quad \forall t, t' \text{ on the same ODE trajectory.}
+$$
+The model must also obey the differential consistency condition:
+$$
+\frac{d}{dt} f_\theta(x_t, t) = 0
+$$
+CMs are trained by minimizing the discrepancy between outputs at adjacent times, with the loss function:
+$$
+\mathcal{L} = \mathbb{E} \left[ d\left(f_\theta(x_t, t), f_\theta(x_{t'}, t')\right) \right]
+$$
+Similar to continuous-timestep DMs and discrete-timestep DMs, CMs also have continuous-time and discrete-time variants. Discrete-time CMs are easier to train, but are more sensitive to timestep scheduling and suffer from discretization errors. Continuous-time CMs, on the other hand, suffer from instability during training.
+
+For a deeper discussion of the differences between the two variants of CMs, and of how to stabilize continuous-time CMs, please refer to *Lu and Song, "Simplifying, Stabilizing and Scaling Continuous-Time Consistency Models."*
+
+## Shortcut Models
+
+Similar to distillation-based methods, the core idea of shortcut models is inspired by the "curved vector field" problem, but shortcut models take a different approach to solving it.
+
+Shortcut models are introduced in *Frans et al., "One Step Diffusion via Shortcut Models."* The paper presents the insight that the poor performance of conventional DMs when jumping with large step sizes stems from their lack of awareness of the step size they are about to jump forward. Since they are only trained with small step sizes, they only learn the tangents of the curved vector field, not the "correct direction" to take when a large step size is used.
+
+Based on this insight, on top of $x_t$ and $t$, shortcut models additionally include the step size $d$ as part of the condition for the denoiser network. At small step sizes ($d\rightarrow 0$), the model behaves like a standard flow-matching model, learning the expected tangent from noise to data. For larger step sizes, the model learns that one large step should equal two consecutive smaller steps (self-consistency), creating a binary recursive formulation. The model is trained by combining the standard flow matching loss when $d=0$ and the self-consistency loss when $d>0$:
+$$
+\mathcal{L} = \mathbb{E} \Big[ \underbrace{\left\| s_\theta(x_t, t, 0) - (x_1 - x_0)\right\|^2}_{\text{Flow-Matching}} + \underbrace{\left\|s_\theta(x_t, t, 2d) - \mathbf{s}_{\text{target}}\right\|^2}_{\text{Self-Consistency}} \Big],
+$$
+$$
+\mathbf{s}_{\text{target}} = \frac{1}{2}\left(s_\theta(x_t, t, d) + s_\theta(x'_{t+d}, t + d, d)\right), \quad x'_{t+d} = x_t + s_\theta(x_t, t, d)\, d.
+$$
+
+![image-20250504180714955](one-step-diffusion-models.assets/image-20250504180714955.png)
+
+> Illustration of the training process of shortcut models. *Source: Frans et al., “One Step Diffusion via Shortcut Models.”*
+
+Both consistency models and shortcut models can be seamlessly scaled between one-step and multi-step generation to balance quality and efficiency.
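To close, here is a minimal sketch of how the shortcut-model objective above could be written as a training step: one flow-matching term at step size 0, and one self-consistency term where a double-size step is regressed onto two chained half-size steps. The toy MLP denoiser, the uniform sampling of $t$ and $d$, the stop-gradient on the self-consistency target, and the tensor shapes are assumptions for illustration, not the exact setup of Frans et al.

```python
import torch
import torch.nn as nn

class ShortcutNet(nn.Module):
    """Toy denoiser s_theta(x_t, t, d) conditioned on timestep t and step size d."""
    def __init__(self, dim=8, hidden=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim + 2, hidden), nn.SiLU(), nn.Linear(hidden, dim))

    def forward(self, x_t, t, d):
        return self.net(torch.cat([x_t, t[:, None], d[:, None]], dim=-1))

def shortcut_loss(model, x0, x1, t, d):
    """Flow-matching loss at step size 0 plus the self-consistency loss:
    s_theta(x_t, t, 2d) should match the average of two consecutive d-sized steps."""
    x_t = t[:, None] * x1 + (1 - t[:, None]) * x0             # X_t = t*X_1 + (1-t)*X_0
    flow_matching = ((model(x_t, t, torch.zeros_like(d)) - (x1 - x0)) ** 2).mean()

    with torch.no_grad():                                      # build the target without gradients
        s1 = model(x_t, t, d)
        x_next = x_t + d[:, None] * s1                         # x'_{t+d} = x_t + s_theta(x_t, t, d) * d
        s2 = model(x_next, t + d, d)
        s_target = (s1 + s2) / 2
    self_consistency = ((model(x_t, t, 2 * d) - s_target) ** 2).mean()
    return flow_matching + self_consistency

model = ShortcutNet()
x0, x1 = torch.randn(16, 8), torch.randn(16, 8)                # x0: clean data, x1: noise (notation above)
t = torch.rand(16) * 0.75                                      # keep t + d within [0, 1]
d = torch.rand(16) * 0.25                                      # random step sizes
loss = shortcut_loss(model, x0, x1, t, d)
loss.backward()
print(loss.item())
```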
diff --git a/dist/blog/template.html b/dist/blog/template.html new file mode 100644 index 0000000..93b3379 --- /dev/null +++ b/dist/blog/template.html @@ -0,0 +1,116 @@ + + + + + + + Yan Lin's Blog - {{ title }} + + + + + + + + + + +
+
+
+
+ +
+
+ +
+
+ +
+
+
+
+ +
+
+ {{ content }} +
+

Copyright © 2025. Designed and implemented by Yan Lin.

+
+ + + + + \ No newline at end of file diff --git a/dist/fonts/Abril_Fatface/AbrilFatface-Regular.ttf b/dist/fonts/Abril_Fatface/AbrilFatface-Regular.ttf new file mode 100644 index 0000000..a291711 Binary files /dev/null and b/dist/fonts/Abril_Fatface/AbrilFatface-Regular.ttf differ diff --git a/dist/fonts/Abril_Fatface/OFL.txt b/dist/fonts/Abril_Fatface/OFL.txt new file mode 100644 index 0000000..94cd88d --- /dev/null +++ b/dist/fonts/Abril_Fatface/OFL.txt @@ -0,0 +1,94 @@ +Copyright (c) 2011, TypeTogether (www.type-together.com), +with Reserved Font Names "Abril" and "Abril Fatface" + +This Font Software is licensed under the SIL Open Font License, Version 1.1. +This license is copied below, and is also available with a FAQ at: +https://openfontlicense.org + + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. 
+ +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/dist/fonts/Domine/Domine-VariableFont_wght.ttf b/dist/fonts/Domine/Domine-VariableFont_wght.ttf new file mode 100644 index 0000000..2fe5de9 Binary files /dev/null and b/dist/fonts/Domine/Domine-VariableFont_wght.ttf differ diff --git a/dist/fonts/Domine/OFL.txt b/dist/fonts/Domine/OFL.txt new file mode 100644 index 0000000..b0c6a98 --- /dev/null +++ b/dist/fonts/Domine/OFL.txt @@ -0,0 +1,97 @@ +Copyright 2020 The Domine Project Authors (https://github.com/googlefonts/domine) +Copyright (c) 2012, Pablo Impallari (www.impallari.com|impallari@gmail.com), +Copyright (c) 2012, Pablo Impallari (www.impallari.com|impallari@gmail.com), +Copyright (c) 2012, Rodrigo Fuenzalida (www.rfuenzalida.com|hello@rfuenzalida.com), +Copyright (c) 2012, Brenda Gallo (gbrenda1987@gmail.com), with Reserved Font Name Domine. + +This Font Software is licensed under the SIL Open Font License, Version 1.1. +This license is copied below, and is also available with a FAQ at: +https://openfontlicense.org + + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. 
The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. 
diff --git a/dist/fonts/Domine/README.txt b/dist/fonts/Domine/README.txt new file mode 100644 index 0000000..a295872 --- /dev/null +++ b/dist/fonts/Domine/README.txt @@ -0,0 +1,66 @@ +Domine Variable Font +==================== + +This download contains Domine as both a variable font and static fonts. + +Domine is a variable font with this axis: + wght + +This means all the styles are contained in a single file: + Domine/Domine-VariableFont_wght.ttf + +If your app fully supports variable fonts, you can now pick intermediate styles +that aren’t available as static fonts. Not all apps support variable fonts, and +in those cases you can use the static font files for Domine: + Domine/static/Domine-Regular.ttf + Domine/static/Domine-Medium.ttf + Domine/static/Domine-SemiBold.ttf + Domine/static/Domine-Bold.ttf + +Get started +----------- + +1. Install the font files you want to use + +2. Use your app's font picker to view the font family and all the +available styles + +Learn more about variable fonts +------------------------------- + + https://developers.google.com/web/fundamentals/design-and-ux/typography/variable-fonts + https://variablefonts.typenetwork.com + https://medium.com/variable-fonts + +In desktop apps + + https://theblog.adobe.com/can-variable-fonts-illustrator-cc + https://helpx.adobe.com/nz/photoshop/using/fonts.html#variable_fonts + +Online + + https://developers.google.com/fonts/docs/getting_started + https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Fonts/Variable_Fonts_Guide + https://developer.microsoft.com/en-us/microsoft-edge/testdrive/demos/variable-fonts + +Installing fonts + + MacOS: https://support.apple.com/en-us/HT201749 + Linux: https://www.google.com/search?q=how+to+install+a+font+on+gnu%2Blinux + Windows: https://support.microsoft.com/en-us/help/314960/how-to-install-or-remove-a-font-in-windows + +Android Apps + + https://developers.google.com/fonts/docs/android + https://developer.android.com/guide/topics/ui/look-and-feel/downloadable-fonts + +License +------- +Please read the full license text (OFL.txt) to understand the permissions, +restrictions and requirements for usage, redistribution, and modification. + +You can use them in your products & projects – print or digital, +commercial or otherwise. + +This isn't legal advice, please consider consulting a lawyer and see the full +license for all details. 
diff --git a/dist/fonts/Domine/static/Domine-Bold.ttf b/dist/fonts/Domine/static/Domine-Bold.ttf new file mode 100644 index 0000000..a9b5438 Binary files /dev/null and b/dist/fonts/Domine/static/Domine-Bold.ttf differ diff --git a/dist/fonts/Domine/static/Domine-Medium.ttf b/dist/fonts/Domine/static/Domine-Medium.ttf new file mode 100644 index 0000000..6c8b046 Binary files /dev/null and b/dist/fonts/Domine/static/Domine-Medium.ttf differ diff --git a/dist/fonts/Domine/static/Domine-Regular.ttf b/dist/fonts/Domine/static/Domine-Regular.ttf new file mode 100644 index 0000000..99a461e Binary files /dev/null and b/dist/fonts/Domine/static/Domine-Regular.ttf differ diff --git a/dist/fonts/Domine/static/Domine-SemiBold.ttf b/dist/fonts/Domine/static/Domine-SemiBold.ttf new file mode 100644 index 0000000..08c3b0b Binary files /dev/null and b/dist/fonts/Domine/static/Domine-SemiBold.ttf differ diff --git a/dist/fonts/Lato/Lato-Black.ttf b/dist/fonts/Lato/Lato-Black.ttf new file mode 100644 index 0000000..4340502 Binary files /dev/null and b/dist/fonts/Lato/Lato-Black.ttf differ diff --git a/dist/fonts/Lato/Lato-BlackItalic.ttf b/dist/fonts/Lato/Lato-BlackItalic.ttf new file mode 100644 index 0000000..4df1555 Binary files /dev/null and b/dist/fonts/Lato/Lato-BlackItalic.ttf differ diff --git a/dist/fonts/Lato/Lato-Bold.ttf b/dist/fonts/Lato/Lato-Bold.ttf new file mode 100644 index 0000000..016068b Binary files /dev/null and b/dist/fonts/Lato/Lato-Bold.ttf differ diff --git a/dist/fonts/Lato/Lato-BoldItalic.ttf b/dist/fonts/Lato/Lato-BoldItalic.ttf new file mode 100644 index 0000000..a05d503 Binary files /dev/null and b/dist/fonts/Lato/Lato-BoldItalic.ttf differ diff --git a/dist/fonts/Lato/Lato-Italic.ttf b/dist/fonts/Lato/Lato-Italic.ttf new file mode 100644 index 0000000..0d0f69e Binary files /dev/null and b/dist/fonts/Lato/Lato-Italic.ttf differ diff --git a/dist/fonts/Lato/Lato-Light.ttf b/dist/fonts/Lato/Lato-Light.ttf new file mode 100644 index 0000000..dfa72ce Binary files /dev/null and b/dist/fonts/Lato/Lato-Light.ttf differ diff --git a/dist/fonts/Lato/Lato-LightItalic.ttf b/dist/fonts/Lato/Lato-LightItalic.ttf new file mode 100644 index 0000000..12f2b6c Binary files /dev/null and b/dist/fonts/Lato/Lato-LightItalic.ttf differ diff --git a/dist/fonts/Lato/Lato-Regular.ttf b/dist/fonts/Lato/Lato-Regular.ttf new file mode 100644 index 0000000..bb2e887 Binary files /dev/null and b/dist/fonts/Lato/Lato-Regular.ttf differ diff --git a/dist/fonts/Lato/Lato-Thin.ttf b/dist/fonts/Lato/Lato-Thin.ttf new file mode 100644 index 0000000..ba58da1 Binary files /dev/null and b/dist/fonts/Lato/Lato-Thin.ttf differ diff --git a/dist/fonts/Lato/Lato-ThinItalic.ttf b/dist/fonts/Lato/Lato-ThinItalic.ttf new file mode 100644 index 0000000..4d82766 Binary files /dev/null and b/dist/fonts/Lato/Lato-ThinItalic.ttf differ diff --git a/dist/fonts/Lato/OFL.txt b/dist/fonts/Lato/OFL.txt new file mode 100644 index 0000000..cb1d5af --- /dev/null +++ b/dist/fonts/Lato/OFL.txt @@ -0,0 +1,93 @@ +Copyright (c) 2010-2014 by tyPoland Lukasz Dziedzic (team@latofonts.com) with Reserved Font Name "Lato" + +This Font Software is licensed under the SIL Open Font License, Version 1.1. 
+This license is copied below, and is also available with a FAQ at: +https://openfontlicense.org + + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. 
The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/dist/fonts/georgia/georgia.ttf b/dist/fonts/georgia/georgia.ttf new file mode 100644 index 0000000..43672d8 Binary files /dev/null and b/dist/fonts/georgia/georgia.ttf differ diff --git a/dist/fonts/georgia/georgiab.ttf b/dist/fonts/georgia/georgiab.ttf new file mode 100644 index 0000000..f460970 Binary files /dev/null and b/dist/fonts/georgia/georgiab.ttf differ diff --git a/dist/fonts/georgia/georgiai.ttf b/dist/fonts/georgia/georgiai.ttf new file mode 100644 index 0000000..c9f60e7 Binary files /dev/null and b/dist/fonts/georgia/georgiai.ttf differ diff --git a/dist/fonts/georgia/georgiaz.ttf b/dist/fonts/georgia/georgiaz.ttf new file mode 100644 index 0000000..124d291 Binary files /dev/null and b/dist/fonts/georgia/georgiaz.ttf differ diff --git a/dist/index.css b/dist/index.css new file mode 100644 index 0000000..650f743 --- /dev/null +++ b/dist/index.css @@ -0,0 +1,245 @@ +/* Font declarations */ +@font-face { + font-family: 'Lato'; + src: url('/fonts/Lato/Lato-Regular.ttf') format('truetype'); + font-weight: normal; + font-style: normal; + font-display: swap; +} + +@font-face { + font-family: 'Lato'; + src: url('/fonts/Lato/Lato-Bold.ttf') format('truetype'); + font-weight: bold; + font-style: normal; + font-display: swap; +} + +@font-face { + font-family: 'Domine'; + src: url('/fonts/Domine/static/Domine-Regular.ttf') format('truetype'); + font-weight: normal; + font-style: normal; + font-display: swap; +} + +@font-face { + font-family: 'Domine'; + src: url('/fonts/Domine/static/Domine-Bold.ttf') format('truetype'); + font-weight: 700; + font-style: normal; + font-display: swap; +} + +@font-face { + font-family: 'Abril Fatface'; + src: url('/fonts/Abril_Fatface/AbrilFatface-Regular.ttf') format('truetype'); + font-weight: normal; + font-style: normal; + font-display: swap; +} + +@font-face { + font-family: 'Georgia'; + src: url('/fonts/georgia/georgia.ttf') format('truetype'); + font-weight: normal; + font-style: normal; + font-display: swap; +} + +@font-face { + font-family: 'Georgia'; + src: url('/fonts/georgia/georgiab.ttf') format('truetype'); + font-weight: bold; + font-style: normal; + font-display: swap; +} + +@font-face { + font-family: 'Georgia'; + src: url('/fonts/georgia/georgiai.ttf') format('truetype'); + font-weight: normal; + font-style: italic; + font-display: swap; +} + +@font-face { + font-family: 'Georgia'; + src: url('/fonts/georgia/georgiaz.ttf') format('truetype'); + font-weight: bold; + font-style: italic; + font-display: swap; +} + +:root { + --main-font-family: Georgia, "Times New Roman", serif; + + /* Light mode variables */ + --background-color: #fff; + --background-secondary: #f8f9fa; + --text-color: #212529; + 
--text-secondary: #6c757d; + --border-color: #dee2e6; + --shadow-color: rgba(0, 0, 0, 0.15); + --primary-text: #58151c; + --secondary-text: #052c65; + --link-hover-color: #555; +} + +@media (prefers-color-scheme: dark) { + :root { + /* Dark mode variables */ + --background-color: #212529; + --background-secondary: #343a40; + --text-color: #f8f9fa; + --text-secondary: #adb5bd; + --border-color: #495057; + --shadow-color: rgba(0, 0, 0, 0.5); + --primary-text: #ffddb3; + --secondary-text: #c6e2ff; + --link-hover-color: #ddd; + } +} + +html, body { + height: 100%; + margin: 0; +} + +body { + font-family: var(--main-font-family); + background-color: var(--background-color); + color: var(--text-color); + display: flex; + flex-direction: column; + min-height: 100vh; +} + +/* Make main content grow to push footer down */ +main.container { + flex: 1 0 auto; +} + +/* Dark mode overrides for Bootstrap components */ +@media (prefers-color-scheme: dark) { + .bg-body-secondary { + background-color: var(--background-secondary) !important; + } + + .text-body-emphasis { + color: var(--text-color) !important; + } + + .border, .border-bottom { + border-color: var(--border-color) !important; + } + + .link-secondary { + color: var(--text-secondary) !important; + } + + .shadow-sm, .shadow { + box-shadow: 0 .125rem .25rem var(--shadow-color) !important; + } + + .btn-light { + background-color: var(--background-secondary); + color: var(--text-color); + border-color: var(--border-color); + } + + .list-group-flush .list-group-item { + background-color: transparent; + color: var(--text-color); + border-color: var(--border-color); + } + + .text-muted { + color: var(--text-secondary) !important; + } + + .figure-caption { + color: var(--text-secondary) !important; + } +} + +.link { + font-family: 'Lato', monospace; + color: var(--link-color); +} + +.link:hover { + color: var(--link-hover-color); +} + +.blog-link { + color: var(--link-color); +} + +.blog-link:hover { + color: var(--link-hover-color); +} + +.section { + margin-top: 2rem; + margin-bottom: 2rem; +} + +.paper-container { + padding: .8rem; +} + +.paper-title { + font-size: calc(1.0rem + 0.1vw); + font-weight: 500; +} + +.paper-link { + font-size: calc(0.7rem + 0.1vw); +} + +.venue-name { + font-size: calc(0.85rem + 0.1vw); + font-weight: 500; +} + +.author-name, .project-desc, .tldr { + font-size: calc(0.7rem + 0.1vw); +} + +.primary-text { + color: var(--primary-text); +} + +.secondary-text { + color: var(--secondary-text); +} + +.blog-title { + font-family: 'Domine', serif; + font-weight: 700; +} + +blockquote { + border-left: 4px solid var(--border-color); + margin: 1.5em 0; + padding: 0.5em 1em; + background-color: var(--background-secondary); +} + +blockquote p { + margin: 0; +} + +@media (prefers-color-scheme: dark) { + blockquote { + border-left-color: var(--border-color); + } +} + +footer { + margin-top: 0rem; + padding: 1rem 0; + width: 100%; + flex-shrink: 0; +} \ No newline at end of file diff --git a/dist/index.html b/dist/index.html new file mode 100644 index 0000000..67e26dd --- /dev/null +++ b/dist/index.html @@ -0,0 +1,746 @@ + + + + + + + Yan Lin's Homepage + + + + + + + + +
+ +
+ +
+
+ + + + + +
+
+ +
+
+ + + + + +
+
+ + + +
+ + +
+
+

Biography - Yan Lin

+

+ I am currently a postdoctoral researcher in the Department of Computer Science at Aalborg University. + I received my PhD and Bachelor's degrees from Beijing Jiaotong University, China. + My research interests include spatiotemporal data mining, representation learning, and AI for science. +

+
+
+ Yan Lin +
+
+ +
+
+

Publications

+ View All +
+
+
+ + +
+
+

+ IEEE TKDE | 2025 +

+
+ + Preprint + + Code + +
+
+
UVTM: Universal Vehicle Trajectory Modeling with ST Feature Domain Generation
+

Yan Lin, Jilin Hu, Shengnan Guo, Bin Yang, Christian S. Jensen, Youfang Lin, Huaiyu Wan

+
+ + + +
+
+

+ IJCAI | 2025 +

+
+ + Preprint + + Code + +
+
+
TrajCogn: Leveraging LLMs for Cognizing Movement Patterns and Travel Purposes from Trajectories
+

Zeyu Zhou*, Yan Lin*, Haomin Wen, Shengnan Guo, Jilin Hu, Youfang Lin, Huaiyu Wan

+
+ + + +
+
+

+ IEEE TKDE | 2025 +

+
+ + Paper + + Preprint + + Code + +
+
+
UniTE: A Survey and Unified Pipeline for Pre-training Spatiotemporal Trajectory Embeddings
+

Yan Lin, Zeyu Zhou, Yicheng Liu, Haochen Lv, Haomin Wen, Tianyi Li, Yushuai Li, Christian S. Jensen, Shengnan Guo, Youfang Lin, Huaiyu Wan

+
+ + + +
+
+

+ WWW | 2025 +

+
+ + Paper + + Code + +
+
+
Path-LLM: A Multi-Modal Path Representation Learning by Aligning and Fusing with Large Language Models
+

Yongfu Wei*, Yan Lin*, Hongfan Gao, Ronghui Xu, Sean Bin Yang, Jilin Hu

+
+ + + +
+
+

+ AAAI | 2025 +

+
+ + Preprint + +
+
+
DutyTTE: Deciphering Uncertainty in Origin-Destination Travel Time Estimation
+

Xiaowei Mao*, Yan Lin*, Shengnan Guo, Yubin Chen, Xingyu Xian, Haomin Wen, Qisen Xu, Youfang Lin, Huaiyu Wan

+
+ + + +
+
+

+ NeurIPS | 2024 +

+
+ + Paper + + Poster + +
+
+
Mobility-LLM: Learning Visiting Intentions and Travel Preference from Human Mobility Data with Large Language Models
+

Letian Gong*, Yan Lin*, Xinyue Zhang, Yiwen Lu, Xuedi Han, Yichen Liu, Shengnan Guo, Youfang Lin, Huaiyu Wan

+
+ + + +
+
+

+ SIGMOD | 2024 +

+
+ + Paper + + Preprint + + Code + +
+
+
Origin-Destination Travel Time Oracle for Map-based Services
+

Yan Lin, Huaiyu Wan, Jilin Hu, Shengnan Guo, Bin Yang, Christian S. Jensen, Youfang Lin

+
+ + + +
+
+

+ IEEE TKDE | 2023 +

+
+ + Paper + + Preprint + + Code + +
+
+
Pre-training General Trajectory Embeddings with Maximum Multi-view Entropy Coding
+

Yan Lin, Huaiyu Wan, Shengnan Guo, Jilin Hu, Christian S. Jensen, Youfang Lin

+
+ + + +
+
+

+ IEEE TKDE | 2022 +

+
+ + Paper + + Code + +
+
+
Pre-training Time-aware location embeddings from spatial-temporal trajectories
+

Huaiyu Wan, Yan Lin, Shengnan Guo, Youfang Lin

+
+ + + +
+
+

+ AAAI | 2021 +

+
+ + Paper + + Code + +
+
+
Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction
+

Yan Lin, Huaiyu Wan, Shengnan Guo, Youfang Lin

+
+ + +
+
+
+ + +
+
+

+ KDD | 2025 +

+
+ + Preprint + + Code + +
+
+
DUET: Dual Clustering Enhanced Multivariate Time Series Forecasting
+

Xiangfei Qiu, Xingjian Wu, Yan Lin, Chenjuan Guo, Jilin Hu, Bin Yang

+
+ + + +
+
+

+ IEEE TKDE | 2024 +

+
+ + Paper + + Preprint + + Code + +
+
+
Diff-RNTraj: A Structure-aware Diffusion Model for Road Network-constrained Trajectory Generation
+

Tonglong Wei, Youfang Lin, Shengnan Guo, Yan Lin, Yiheng Huang, Chenyang Xiang, Yuqing Bai, Menglu Ya, Huaiyu Wan

+
+ + + +
+
+

+ IEEE TKDE | 2024 +

+
+ + Paper + +
+
+
STCDM: Spatio-Temporal Contrastive Diffusion Model for Check-In Sequence Generation
+

Letian Gong, Shengnan Guo, Yan Lin, Yichen Liu, Erwen Zheng, Yiwei Shuang, Youfang Lin, Jilin Hu, Huaiyu Wan

+
+ + + +
+
+

+ IEEE TKDE | 2024 +

+
+ + Paper + + Preprint + + Code + +
+
+
Micro-Macro Spatial-Temporal Graph-based Encoder-Decoder for Map-Constrained Trajectory Recovery
+

Tonglong Wei, Youfang Lin, Yan Lin, Shengnan Guo, Lan Zhang, Huaiyu Wan

+
+ + + +
+
+

+ KBS | 2024 +

+
+ + Paper + + Code + +
+
+
Inductive and Adaptive Graph Convolution Networks Equipped with Constraint Task for Spatial-Temporal Traffic Data Kriging
+

Tonglong Wei, Youfang Lin, Shengnan Guo, Yan Lin, Yiji Zhao, Xiyuan Jin, Zhihao Wu, Huaiyu Wan

+
+ + + +
+
+

+ IEEE TKDE | 2024 +

+
+ + Preprint + +
+
+
Spatial-Temporal Cross-View Contrastive Pre-Training for Check-in Sequence Representation Learning
+

Letian Gong, Huaiyu Wan, Shengnan Guo, Li Xiucheng, Yan Lin, Erwen Zheng, Tianyi Wang, Zeyu Zhou, Youfang Lin

+
+ + + +
+
+

+ AAAI | 2023 +

+
+ + Paper + + Code + +
+
+
Contrastive Pre-training with Adversarial Perturbations for Check-In Sequence Representation Learning
+

Letian Gong, Youfang Lin, Shengnan Guo, Yan Lin, Tianyi Wang, Erwen Zheng, Zeyu Zhou, Huaiyu Wan

+
+ + + +
+
+

+ ESWA | 2023 +

+
+ + Paper + +
+
+
Adversarial Self-Attentive Time-Variant Neural Networks for Multi-Step Time Series Forecasting
+

Changxia Gao, Ning Zhang, Youru Li, Yan Lin, Huaiyu Wan

+
+ + + +
+
+

+ APIN | 2023 +

+
+ + Paper + +
+
+
Multi-scale Adaptive Attention-based Time-Variant Neural Networks for Multi-step Time Series Forecasting
+

Changxia Gao, Ning Zhang, Youru Li, Yan Lin, Huaiyu Wan

+
+ + + +
+
+

+ NeurIPS | 2023 +

+
+ + Paper + + Code + +
+
+
WITRAN: Water-wave Information Transmission and Recurrent Acceleration Network for Long-range Time Series Forecasting
+

Yuxin Jia, Youfang Lin, Xinyan Hao, Yan Lin, Shengnan Guo, Huaiyu Wan

+
+ + +
+
+
+ * Equal Contribution +
+
+ +
+
+

Projects

+ View All +
+
+
+ + +
+
+

+ Fundamental Research Funds for the Central Universities of China +

+
+ +
+
+
Research on Prediction of User Travel Destination and Travel Time Based on Trajectory Representation Learning
+

Applies representation learning to trajectory data to transform original features into high-level information, improving the performance of downstream tasks such as travel time and destination prediction.

+
+ + + +
+
+

+ Personal Interest Project +

+
+ + Home + + Install + +
+
+
Development of OverleafCopilot - Empowering Academic Writing in Overleaf with Large Language Models
+

This project aims to develop a browser extension that seamlessly integrates Large Language Models (such as ChatGPT) into the popular online academic writing platform, Overleaf.

+
+ + + +
+
+

+ Personal Interest Project +

+
+ + Website + + Code + +
+
+
Development of PromptGenius - All-purpose prompts for LLMs
+

This project focuses on developing a website that offers a wide range of prompt categories, enhancing the versatility of LLMs for various tasks and improving their output quality.

+
+ + +
+
+
+ + +
+
+

+ Villum Foundation +

+
+ +
+
+
Research on Inverse Design of Materials Using Diffusion Probabilistic Models
+

This project focuses on developing diffusion probabilistic models that first learn the relationship between chemistry/structure and material properties and then enable the inverse design of new materials with specific properties. It currently supports my postdoctoral research position.

+
+ + + +
+
+

+ National Natural Science Foundation of China +

+
+ +
+
+
Research on Pre-training Representation Learning Methods of Spatial-temporal Trajectory Data for Traffic Prediction
+

This project aims to propose pre-training representation learning methods for spatial-temporal trajectory data, modeling multiple features to improve traffic prediction tasks. It demonstrates how trajectory representation learning can enhance traffic data mining.

+
+ + + +
+
+

+ National Natural Science Foundation of China +

+
+ +
+
+
Research on Spatial-temporal Trajectory Generation and Representation Learning Methods for Sparsity Problems
+

This project explores how to generate high-quality spatial-temporal trajectory data and corresponding representations to address sparsity-related issues, thereby supporting a variety of downstream tasks.

+
+ + +
+
+
+ +
+
+

Presentations

+ View All +
+
+ +
+
+

+ Guest lecture | Aalborg University +

+
+ + Slides + +
+
+
Self-supervised Learning of Trajectory Data
+
+ +
+
+

+ Workshop presentation | KDD 2024 +

+
+ + Slides + + Paper + +
+
+
PLM4Traj: Leveraging Pre-trained Language Models for Cognizing Movement Patterns and Travel Purposes from Trajectories
+
+ +
+
+

+ Paper Oral | SIGMOD 2024 +

+
+ + Slides + +
+
+
Origin-Destination Travel Time Oracle for Map-based Services
+
+ +
+
+

+ Tutorial | SpatialDI 2024 +

+
+ + Slides + +
+
+
Self-supervised Learning of Spatial-temporal Trajectories
+
+ +
+
+

+ Paper Oral | AAAI 2021 +

+
+ + Slides + +
+
+
Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction
+
+ +
+
+ +
+

Services

+
+
  • IEEE, ACM member
  • Secretary of IEEE (Denmark Section) Computer Society
  • Reviewer for journals including TIST, TII, and TVT
  • Member of program committees of ICLR, KDD, AAAI, CVPR, ICCV, IJCAI, and WWW
+
+
+ +
+
+

Blog

+ View All +
+
+ +
+ One Step Diffusion Models | May 2025 +

Despite the promising performance of diffusion models on continuous-modality generation, one deficiency holding them back is their reliance on a multi-step denoising process, which is computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of sampling in one or a few steps.
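To make the cost argument concrete, below is a minimal, self-contained sketch of a DDIM-style sampling loop; it is not code from any of the surveyed works, and the eps_model stub and linear schedule are hypothetical stand-ins for a trained noise-prediction network. The point is simply that sampling cost is proportional to the number of denoiser calls, which is what one- and few-step methods reduce.

import numpy as np

def make_alpha_bar(num_train_steps=1000):
    # Linear beta schedule -> cumulative product of (1 - beta_t).
    betas = np.linspace(1e-4, 0.02, num_train_steps)
    return np.cumprod(1.0 - betas)

def eps_model(x, t):
    # Hypothetical denoiser: a real sampler would call a trained network here.
    # Returning zeros keeps the sketch runnable while preserving the loop structure.
    return np.zeros_like(x)

def ddim_sample(shape, num_steps, num_train_steps=1000, seed=0):
    rng = np.random.default_rng(seed)
    alpha_bar = make_alpha_bar(num_train_steps)
    # Evenly spaced subset of the training timesteps, from T-1 down to 0.
    timesteps = np.linspace(num_train_steps - 1, 0, num_steps).astype(int)
    x = rng.standard_normal(shape)  # start from pure noise
    for i, t in enumerate(timesteps):
        eps = eps_model(x, t)  # one network forward pass per step
        x0_pred = (x - np.sqrt(1.0 - alpha_bar[t]) * eps) / np.sqrt(alpha_bar[t])
        a_prev = alpha_bar[timesteps[i + 1]] if i + 1 < len(timesteps) else 1.0
        # Deterministic (eta = 0) DDIM update toward the previous timestep.
        x = np.sqrt(a_prev) * x0_pred + np.sqrt(1.0 - a_prev) * eps
    return x

slow = ddim_sample((4, 8), num_steps=50)  # 50 denoiser calls
fast = ddim_sample((4, 8), num_steps=2)   # 2 denoiser calls, same loop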

+
+ +
+ Multi-modal and Multi-function Transformers | April 2025 +

Multi-modal and multi-function Transformers enable a single architecture to process diverse data types such as language, images, and videos simultaneously. These models employ techniques like vector quantization and lookup-free quantization to map different modalities into a unified embedding space, allowing the Transformer to handle them within the same sequence. Beyond processing multiple data types, these architectures can also combine different functionalities, such as auto-regressive language generation and diffusion-based image creation, within a single model.
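As a toy illustration of the quantization idea mentioned above, the sketch below shows a nearest-neighbor codebook lookup that maps continuous patch embeddings to discrete token ids which can share a sequence with text tokens; the codebook and features are random placeholders, not any specific tokenizer from the post.

import numpy as np

rng = np.random.default_rng(0)
codebook = rng.standard_normal((512, 64))  # 512 code vectors of dimension 64

def vq_tokenize(features):
    # features: (n, 64) continuous embeddings, e.g. image patches or video frames.
    # Each row is assigned the id of its nearest code vector, turning the
    # modality into discrete tokens a Transformer can mix with text ids.
    dists = ((features[:, None, :] - codebook[None, :, :]) ** 2).sum(axis=-1)
    return dists.argmin(axis=1)

patch_embeddings = rng.standard_normal((16, 64))  # e.g. 16 image patches
image_token_ids = vq_tokenize(patch_embeddings)   # shape (16,), values in [0, 512)
# Offsetting these ids past the text vocabulary lets a single Transformer
# process text tokens and image tokens in one sequence.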

+
+ +
+
+ +
\ No newline at end of file
diff --git a/dist/logo.webp b/dist/logo.webp
new file mode 100644
index 0000000..a5da535
Binary files /dev/null and b/dist/logo.webp differ
diff --git a/dist/presentations/index.html b/dist/presentations/index.html
new file mode 100644
index 0000000..d79f735
--- /dev/null
+++ b/dist/presentations/index.html
@@ -0,0 +1,158 @@
+ [Generated page "Yan Lin's Presentations" — renders the same five presentation entries listed in the Presentations section above; markup omitted.]
\ No newline at end of file
diff --git a/dist/profile.webp b/dist/profile.webp
new file mode 100644
index 0000000..3e74c2a
Binary files /dev/null and b/dist/profile.webp differ
diff --git a/dist/projects/index.html b/dist/projects/index.html
new file mode 100644
index 0000000..c10931a
--- /dev/null
+++ b/dist/projects/index.html
@@ -0,0 +1,206 @@
+ [Generated page "Yan Lin's Projects" — renders the same primary and secondary project entries listed in the Projects section above; markup omitted.]
\ No newline at end of file
diff --git a/dist/publications/index.html b/dist/publications/index.html
new file mode 100644
index 0000000..174fa32
--- /dev/null
+++ b/dist/publications/index.html
@@ -0,0 +1,492 @@
+ [Generated page "Yan Lin's Publications" — renders the same primary and secondary publication entries listed in the Publications section above, including the "* Equal Contribution" note; markup omitted.]
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..36f8ddb
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,15 @@
+services:
+  homepage:
+    image: nginx:alpine
+    container_name: homepage
+    ports:
+      - "9000:80"
+    volumes:
+      - ./dist:/usr/share/nginx/html
+    restart: unless-stopped
+    networks:
+      - proxy-network
+
+networks:
+  proxy-network:
+    external: true
diff --git a/generate.py b/generate.py
new file mode 100644
index 0000000..dd166ec
--- /dev/null
+++ b/generate.py
@@ -0,0 +1,33 @@
+import os
+import yaml
+from jinja2 import Environment, FileSystemLoader
+
+
+if __name__ == '__main__':
+    with open('data.yaml', 'r') as file:
+        profile_data = yaml.safe_load(file)
+
+    env = Environment(loader=FileSystemLoader('templates'))
+
+    os.makedirs('dist', exist_ok=True)
+    os.makedirs('dist/publications', exist_ok=True)
+    os.makedirs('dist/projects', exist_ok=True)
+    os.makedirs('dist/presentations', exist_ok=True)
+    os.makedirs('dist/blog', exist_ok=True)
+    os.makedirs('dist/blog/html', exist_ok=True)
+
+    def render_template(template_name, output_path, **kwargs):
+        template = env.get_template(template_name)
+        html = template.render(**kwargs)
+
+        with open(output_path, 'w') as file:
+            file.write(html)
+
+        print(f'Generated {output_path}')
+
+    render_template('index.html', 'dist/index.html', data=profile_data, is_home_page=True)
+    render_template('publications.html', 'dist/publications/index.html', data=profile_data, is_home_page=False)
+    render_template('projects.html', 'dist/projects/index.html', data=profile_data, is_home_page=False)
+    render_template('presentations.html', 'dist/presentations/index.html', data=profile_data, is_home_page=False)
+    render_template('blog.html', 'dist/blog/index.html', data=profile_data, is_home_page=False)
+    print('Static site generation complete!')
diff --git a/parser/md.py b/parser/md.py
new file mode 100644
index 0000000..a759ca2
--- /dev/null
+++ b/parser/md.py
@@ -0,0 +1,168 @@
+import markdown
+import re
+import os
+import glob
+from typing import List
+
+
+def markdown_to_html_paragraphs(markdown_text: str) -> List[str]:
+    """
+    Convert markdown text into a list of HTML paragraphs.
+    Supports mathematical equations using LaTeX syntax.
+
+    Args:
+        markdown_text (str): The markdown text to convert
+
+    Returns:
+        List[str]: A list of HTML paragraphs, each wrapped in <p></p> tags
+    """
+    # Prepend "md/" to image paths if they don't already start with md/
+    markdown_text = re.sub(r'!\[(.*?)\]\((?!md/)([^/].*?\.assets/.*?)\)', r'![\1](/blog/md/\2)', markdown_text)
+
+    # Check if the first line starts with a # for h1 title
+    lines = markdown_text.split('\n')
+    has_h1_title = False
+    bold_title = None
+
+    if lines and lines[0].strip().startswith('#'):
+        has_h1_title = True
+        title_line = lines[0].strip().lstrip('#').strip()
+        bold_title = f'<p><strong>{title_line}</strong></p>'
+        # Remove the title from the markdown to avoid duplicate processing
+        markdown_text = '\n'.join(lines[1:])
+    else:
+        raise ValueError("No title found in the markdown file")
+
+    # Configure markdown with math extensions
+    extensions = [
+        'markdown.extensions.extra',  # For blockquotes and other features
+        'markdown.extensions.fenced_code',  # For code blocks
+        'markdown.extensions.codehilite',  # For syntax highlighting
+        'markdown.extensions.attr_list',  # For attributes
+        'markdown.extensions.md_in_html',  # For markdown inside HTML
+        'mdx_math',  # For math support
+    ]
+
+    try:
+        # Try to use pymdownx.arithmatex, whose generic output is compatible with MathJax 3
+        import pymdownx.arithmatex
+        extensions.remove('mdx_math')
+        extensions.append('pymdownx.arithmatex')
+        extension_configs = {
+            'pymdownx.arithmatex': {
+                'generic': True  # Uses \(...\) for inline and \[...\] for display math
+            }
+        }
+    except ImportError:
+        # Fallback to mdx_math
+        extension_configs = {
+            'mdx_math': {
+                'enable_dollar_delimiter': True,  # Enable $...$ for inline math
+            }
+        }
+
+    # Convert markdown to HTML with math support
+    html = markdown.markdown(
+        markdown_text,
+        extensions=extensions,
+        extension_configs=extension_configs
+    )
+
+    # Unwrap images that were placed in a paragraph of their own
+    html = re.sub(r'<p>\s*(<img[^>]+>)\s*</p>', r'\1', html, flags=re.IGNORECASE)
+    # Convert image followed by blockquote to figure with caption
+    html = re.sub(
+        r'<img([^>]+)>\s*<blockquote>\s*<p>(.*?)</p>\s*</blockquote>',
+        r'<figure>\n  <img\1>\n  <figcaption>\2</figcaption>\n</figure>',
+        html,
+        flags=re.DOTALL
+    )
+
+    # Add "link" class and target="_blank" to all <a> tags
+    html = re.sub(r'', r'', html)
+    html = re.sub(r'', r'', html)
+    html = re.sub(r'', r'', html)
+
+    # Split the HTML into paragraphs
+    paragraphs = html.split('\n\n')
+
+    # Clean up and ensure each paragraph is properly wrapped
+    cleaned_paragraphs = []
+
+    # Add the bold title as the first element if it exists
+    if has_h1_title and bold_title:
+        cleaned_paragraphs.append(bold_title)
+
+    for p in paragraphs:
+        p = p.strip()
+        if p:
+            # If the paragraph doesn't already have <p> tags, add them
+            if not (p.startswith('<') and not p.startswith('<p>')):
+                p = f'<p>{p}</p>'
+            cleaned_paragraphs.append(p)
+
+    return cleaned_paragraphs, title_line
+
+
+def insert_markdown_into_template(template_path: str, markdown_text: str) -> str:
+    """
+    Insert parsed markdown content into the template HTML file.
+
+    Args:
+        template_path (str): Path to the template HTML file
+        markdown_text (str): The markdown text to convert and insert
+
+    Returns:
+        str: Complete HTML with markdown content inserted
+    """
+    # Parse markdown into HTML paragraphs
+    html_paragraphs, title_line = markdown_to_html_paragraphs(markdown_text)
+
+    # Read the template
+    with open(template_path, 'r') as f:
+        template = f.read()
+
+    # Join paragraphs into a single string
+    content_html = '\n'.join(html_paragraphs)
+
+    # Insert the content into the template
+    complete_html = template.replace('{{ content }}', content_html)
+
+    # Replace {{ title }} placeholders with the extracted title
+    complete_html = complete_html.replace('{{ title }}', title_line)
+
+    return complete_html
+
+
+def process_all_markdown_files():
+    """
+    Process all markdown files in blog/md/ directory and generate HTML files in blog/html/.
+    """
+    # Get all markdown files in blog/md/
+    md_files = glob.glob("dist/blog/md/*.md")
+    template_path = "dist/blog/template.html"
+
+    for md_file in md_files:
+        # Extract base filename without extension
+        base_name = os.path.basename(md_file)[:-3]  # Remove .md extension
+        html_file = f"dist/blog/html/{base_name}.html"
+
+        print(f"Processing {md_file} -> {html_file}")
+
+        try:
+            # Read the markdown content
+            with open(md_file, "r") as f:
+                markdown_text = f.read()
+
+            # Generate HTML content
+            complete_html = insert_markdown_into_template(template_path, markdown_text)
+
+            # Write HTML output
+            with open(html_file, "w") as f:
+                f.write(complete_html)
+
+        except Exception as e:
+            print(f"Error processing {md_file}: {str(e)}")
+
+
+if __name__ == "__main__":
+    process_all_markdown_files()
\ No newline at end of file
diff --git a/preproduction.sh b/preproduction.sh
new file mode 100644
index 0000000..89a7e72
--- /dev/null
+++ b/preproduction.sh
@@ -0,0 +1,4 @@
+python parser/md.py
+python generate.py
+cd dist
+python -m http.server 8000
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e70e63c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+markdown>=3.4.0
+python-markdown-math>=0.8
+pyyaml>=6.0.2
+jinja2>=3.1.6
\ No newline at end of file
diff --git a/sync.sh b/sync.sh
new file mode 100644
index 0000000..66d6ab4
--- /dev/null
+++ b/sync.sh
@@ -0,0 +1,4 @@
+python parser/md.py
+python generate.py
+rsync -avP --delete ./dist/ hetzner:~/homepage/dist
+rsync -avP ./docker-compose.yml hetzner:~/homepage/
\ No newline at end of file
diff --git a/templates/base.html b/templates/base.html
new file mode 100644
index 0000000..fb80af9
--- /dev/null
+++ b/templates/base.html
@@ -0,0 +1,85 @@
+ [HTML head and layout markup omitted.]
+ {% block title %}Yan Lin{% endblock %}
+ {% block extra_head %}{% endblock %}
+ {% if is_home_page %} +
+ {% else %} +
+ {% endif %} +
+
+ {% block header_left %} + {% if is_home_page %} + + {% else %} + + {% endif %} + {% endblock %} +
+
+ +
+
+ {% block header_right %} + {% if is_home_page %} + + {% endif %} + {% endblock %} +
+
+ {% block navigation %}{% endblock %} +
+ + {% block content %}{% endblock %} +
+ +
+
+

+ Copyright © 2025. Designed and implemented by Yan Lin. +

+
+
+ + + + {% block extra_js %} + + {% endblock %} + + + \ No newline at end of file diff --git a/templates/blog.html b/templates/blog.html new file mode 100644 index 0000000..954c14e --- /dev/null +++ b/templates/blog.html @@ -0,0 +1,18 @@ +{% extends 'base.html' %} + +{% block title %}Yan Lin's Blog{% endblock %} + +{% block header_title %}Yan Lin's Blog{% endblock %} + +{% block navigation %} +{% endblock %} + +{% block content %} +
+
+ {% for blog in data.blogs %} + {% include 'partials/blog.html' %} + {% endfor %} +
+
+{% endblock %} \ No newline at end of file diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..4bcc887 --- /dev/null +++ b/templates/index.html @@ -0,0 +1,113 @@ +{% extends 'base.html' %} + +{% block title %}Yan Lin's Homepage{% endblock %} + +{% block navigation %} + {% include 'partials/navigation.html' %} +{% endblock %} + +{% block content %} +
+
+

Biography - Yan Lin

+

+ I am currently a postdoctoral researcher in the Department of Computer Science at Aalborg University.
+ I received my PhD and Bachelor's degrees from Beijing Jiaotong University, China.
+ My research interests include spatiotemporal data mining, representation learning, and AI for science.

+
+
+ Yan Lin +
+
+ +
+
+

Publications

+ View All +
+
+
+ {% for pub in data.primaryPublications[:10] %} + {% with type='primary' %} + {% include 'partials/publication.html' %} + {% endwith %} + {% endfor %} +
+
+
+ {% for pub in data.secondaryPublications[:10] %} + {% with type='secondary' %} + {% include 'partials/publication.html' %} + {% endwith %} + {% endfor %} +
+
+
+ * Equal Contribution +
+
+ +
+
+

Projects

+ View All +
+
+
+ {% for project in data.primaryProjects[:3] %} + {% with type='primary' %} + {% include 'partials/project.html' %} + {% endwith %} + {% endfor %} +
+
+
+ {% for project in data.secondaryProjects[:3] %} + {% with type='secondary' %} + {% include 'partials/project.html' %} + {% endwith %} + {% endfor %} +
+
+
+ +
+
+

Presentations

+ View All +
+
+ {% for presentation in data.presentations[:5] %} + {% include 'partials/presentation.html' %} + {% endfor %} +
+
+ +
+

Services

+
+
    + {% for service in data.services %} +
  • {{ service }}
  • + {% endfor %} +
+
+
+ +
+
+

Blog

+ View All +
+
+ {% for blog in data.blogs[:3] %} + {% include 'partials/blog.html' %} + {% endfor %} +
+
+{% endblock %} + +{% block extra_js %} + {{ super() }} +{% endblock %} \ No newline at end of file diff --git a/templates/partials/blog.html b/templates/partials/blog.html new file mode 100644 index 0000000..8f1756c --- /dev/null +++ b/templates/partials/blog.html @@ -0,0 +1,4 @@ +
+ {{ blog.title }} | {{ blog.badge }} +

{{ blog.tldr }}

+
\ No newline at end of file diff --git a/templates/partials/navigation.html b/templates/partials/navigation.html new file mode 100644 index 0000000..83b6961 --- /dev/null +++ b/templates/partials/navigation.html @@ -0,0 +1,16 @@ + \ No newline at end of file diff --git a/templates/partials/presentation.html b/templates/partials/presentation.html new file mode 100644 index 0000000..d24f943 --- /dev/null +++ b/templates/partials/presentation.html @@ -0,0 +1,13 @@ +
+
+

+ {{ presentation.tags|join(" | ")|safe }} +

+
+ {% for name, url in presentation.links.items() %} + {{ name }} + {% endfor %} +
+
+
{{ presentation.title|safe }}
+
\ No newline at end of file diff --git a/templates/partials/project.html b/templates/partials/project.html new file mode 100644 index 0000000..24d2e07 --- /dev/null +++ b/templates/partials/project.html @@ -0,0 +1,14 @@ +
+
+

+ {{ project.tags|join(" | ")|safe }} +

+
+ {% for name, url in project.links.items() %} + {{ name }} + {% endfor %} +
+
+
{{ project.title|safe }}
+

{{ project.desc|safe }}

+
\ No newline at end of file diff --git a/templates/partials/publication.html b/templates/partials/publication.html new file mode 100644 index 0000000..6c2d806 --- /dev/null +++ b/templates/partials/publication.html @@ -0,0 +1,14 @@ +
+
+

+ {{ pub.tags|join(" | ")|safe }} +

+
+ {% for name, url in pub.links.items() %} + {{ name }} + {% endfor %} +
+
+
{{ pub.title|safe }}
+

{{ pub.authors|safe }}

+
\ No newline at end of file diff --git a/templates/presentations.html b/templates/presentations.html new file mode 100644 index 0000000..9b6658c --- /dev/null +++ b/templates/presentations.html @@ -0,0 +1,18 @@ +{% extends 'base.html' %} + +{% block title %}Yan Lin's Presentations{% endblock %} + +{% block header_title %}Yan Lin's Presentations{% endblock %} + +{% block navigation %} +{% endblock %} + +{% block content %} +
+
+ {% for presentation in data.presentations %} + {% include 'partials/presentation.html' %} + {% endfor %} +
+
+{% endblock %} \ No newline at end of file diff --git a/templates/projects.html b/templates/projects.html new file mode 100644 index 0000000..835253a --- /dev/null +++ b/templates/projects.html @@ -0,0 +1,31 @@ +{% extends 'base.html' %} + +{% block title %}Yan Lin's Projects{% endblock %} + +{% block header_title %}Yan Lin's Projects{% endblock %} + +{% block navigation %} + +{% endblock %} + +{% block content %} +
+

Primary Projects

+
+ {% for project in data.primaryProjects %} + {% with type='primary' %} + {% include 'partials/project.html' %} + {% endwith %} + {% endfor %} +
+ +

Secondary Projects

+
+ {% for project in data.secondaryProjects %} + {% with type='secondary' %} + {% include 'partials/project.html' %} + {% endwith %} + {% endfor %} +
+
+{% endblock %} \ No newline at end of file diff --git a/templates/publications.html b/templates/publications.html new file mode 100644 index 0000000..6b2ae9d --- /dev/null +++ b/templates/publications.html @@ -0,0 +1,35 @@ +{% extends 'base.html' %} + +{% block title %}Yan Lin's Publications{% endblock %} + +{% block header_title %}Yan Lin's Publications{% endblock %} + +{% block navigation %} + +{% endblock %} + +{% block content %} +
+

Primary Publications

+
+ {% for pub in data.primaryPublications %} + {% with type='primary' %} + {% include 'partials/publication.html' %} + {% endwith %} + {% endfor %} +
+ +

Secondary Publications

+
+ {% for pub in data.secondaryPublications %} + {% with type='secondary' %} + {% include 'partials/publication.html' %} + {% endwith %} + {% endfor %} +
+ +
+ * Equal Contribution +
+
+{% endblock %} \ No newline at end of file