First commit

Yan Lin 2025-05-13 10:48:42 +02:00
commit ed9566d057
76 changed files with 4005 additions and 0 deletions

277
data.yaml Normal file

@@ -0,0 +1,277 @@
primaryPublications:
- title: "UVTM: Universal Vehicle Trajectory Modeling with ST Feature Domain Generation"
authors: "Yan Lin, Jilin Hu, Shengnan Guo, Bin Yang, Christian S. Jensen, Youfang Lin, Huaiyu Wan"
tags:
- "IEEE TKDE"
- "2025"
links:
Preprint: "https://arxiv.org/abs/2402.07232"
Code: "https://github.com/Logan-Lin/UVTM"
- title: "TrajCogn: Leveraging LLMs for Cognizing Movement Patterns and Travel Purposes from Trajectories"
authors: "Zeyu Zhou*, <strong>Yan Lin*</strong>, Haomin Wen, Shengnan Guo, Jilin Hu, Youfang Lin, Huaiyu Wan"
tags:
- "IJCAI"
- "2025"
links:
Preprint: "https://arxiv.org/abs/2405.12459"
Code: "https://github.com/Zeru19/PLM4Traj"
- title: "UniTE: A Survey and Unified Pipeline for Pre-training Spatiotemporal Trajectory Embeddings"
authors: "<strong>Yan Lin</strong>, Zeyu Zhou, Yicheng Liu, Haochen Lv, Haomin Wen, Tianyi Li, Yushuai Li, Christian S. Jensen, Shengnan Guo, Youfang Lin, Huaiyu Wan"
tags:
- "IEEE TKDE"
- "2025"
links:
Paper: "https://ieeexplore.ieee.org/document/10818577"
Preprint: "https://arxiv.org/abs/2407.12550"
Code: "https://github.com/Logan-Lin/UniTE"
- title: "Path-LLM: A Multi-Modal Path Representation Learning by Aligning and Fusing with Large Language Models"
authors: "Yongfu Wei*, <strong>Yan Lin*</strong>, Hongfan Gao, Ronghui Xu, Sean Bin Yang, Jilin Hu"
tags:
- "WWW"
- "2025"
links:
Paper: "https://openreview.net/forum?id=KmMSQS6tFn"
Code: "https://github.com/decisionintelligence/Path-LLM"
- title: "DutyTTE: Deciphering Uncertainty in Origin-Destination Travel Time Estimation"
authors: "Xiaowei Mao*, <strong>Yan Lin*</strong>, Shengnan Guo, Yubin Chen, Xingyu Xian, Haomin Wen, Qisen Xu, Youfang Lin, Huaiyu Wan"
tags:
- "AAAI"
- "2025"
links:
Preprint: "https://arxiv.org/abs/2408.12809"
- title: "Mobility-LLM: Learning Visiting Intentions and Travel Preference from Human Mobility Data with Large Language Models"
authors: "Letian Gong*, <strong>Yan Lin*</strong>, Xinyue Zhang, Yiwen Lu, Xuedi Han, Yichen Liu, Shengnan Guo, Youfang Lin, Huaiyu Wan"
tags:
- "NeurIPS"
- "2024"
links:
Paper: "https://openreview.net/forum?id=0feJEykDRx"
Poster: "https://neurips.cc/virtual/2024/poster/96914"
- title: "Origin-Destination Travel Time Oracle for Map-based Services"
authors: "<strong>Yan Lin</strong>, Huaiyu Wan, Jilin Hu, Shengnan Guo, Bin Yang, Christian S. Jensen, Youfang Lin"
tags:
- "SIGMOD"
- "2024"
links:
Paper: "https://dl.acm.org/doi/10.1145/3617337"
Preprint: "https://arxiv.org/abs/2307.03048"
Code: "https://github.com/Logan-Lin/DOT"
- title: "Pre-training General Trajectory Embeddings with Maximum Multi-view Entropy Coding"
authors: "<strong>Yan Lin</strong>, Huaiyu Wan, Shengnan Guo, Jilin Hu, Christian S. Jensen, Youfang Lin"
tags:
- "IEEE TKDE"
- "2023"
links:
Paper: "https://ieeexplore.ieee.org/abstract/document/10375102"
Preprint: "https://arxiv.org/abs/2207.14539"
Code: "https://github.com/Logan-Lin/MMTEC"
- title: "Pre-training Time-aware location embeddings from spatial-temporal trajectories"
authors: "Huaiyu Wan, <strong>Yan Lin</strong>, Shengnan Guo, Youfang Lin"
tags:
- "IEEE TKDE"
- "2022"
links:
Paper: "https://ieeexplore.ieee.org/abstract/document/9351627"
Code: "https://github.com/Logan-Lin/TALE"
- title: "Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction"
authors: "<strong>Yan Lin</strong>, Huaiyu Wan, Shengnan Guo, Youfang Lin"
tags:
- "AAAI"
- "2021"
links:
Paper: "https://ojs.aaai.org/index.php/AAAI/article/view/16548"
Code: "https://github.com/Logan-Lin/CTLE"
secondaryPublications:
- title: "DUET: Dual Clustering Enhanced Multivariate Time Series Forecasting"
authors: "Xiangfei Qiu, Xingjian Wu, <strong>Yan Lin</strong>, Chenjuan Guo, Jilin Hu, Bin Yang"
tags:
- "KDD"
- "2025"
links:
Preprint: "https://arxiv.org/abs/2412.10859"
Code: "https://github.com/decisionintelligence/DUET"
- title: "Diff-RNTraj: A Structure-aware Diffusion Model for Road Network-constrained Trajectory Generation"
authors: "Tonglong Wei, Youfang Lin, Shengnan Guo, <strong>Yan Lin</strong>, Yiheng Huang, Chenyang Xiang, Yuqing Bai, Menglu Ya, Huaiyu Wan"
tags:
- "IEEE TKDE"
- "2024"
links:
Paper: "https://www.computer.org/csdl/journal/tk/5555/01/10679607/20b3hlbjBOo"
Preprint: "https://arxiv.org/abs/2402.07369"
Code: "https://github.com/wtl52656/Diff-RNTraj"
- title: "STCDM: Spatio-Temporal Contrastive Diffusion Model for Check-In Sequence Generation"
authors: "Letian Gong, Shengnan Guo, <strong>Yan Lin</strong>, Yichen Liu, Erwen Zheng, Yiwei Shuang, Youfang Lin, Jilin Hu, Huaiyu Wan"
tags:
- "IEEE TKDE"
- "2024"
links:
Paper: "https://ieeexplore.ieee.org/document/10836764"
- title: "Micro-Macro Spatial-Temporal Graph-based Encoder-Decoder for Map-Constrained Trajectory Recovery"
authors: "Tonglong Wei, Youfang Lin, <strong>Yan Lin</strong>, Shengnan Guo, Lan Zhang, Huaiyu Wan"
tags:
- "IEEE TKDE"
- "2024"
links:
Paper: "https://www.computer.org/csdl/journal/tk/5555/01/10517676/1WCj0j0FljW"
Preprint: "https://arxiv.org/abs/2404.19141"
Code: "https://github.com/wtl52656/MM-STGED"
- title: "Inductive and Adaptive Graph Convolution Networks Equipped with Constraint Task for Spatial-Temporal Traffic Data Kriging"
authors: "Tonglong Wei, Youfang Lin, Shengnan Guo, <strong>Yan Lin</strong>, Yiji Zhao, Xiyuan Jin, Zhihao Wu, Huaiyu Wan"
tags:
- "KBS"
- "2024"
links:
Paper: "https://www.sciencedirect.com/science/article/pii/S0950705123010730"
Code: "https://github.com/wtl52656/IAGCN"
- title: "Spatial-Temporal Cross-View Contrastive Pre-Training for Check-in Sequence Representation Learning"
authors: "Letian Gong, Huaiyu Wan, Shengnan Guo, Li Xiucheng, <strong>Yan Lin</strong>, Erwen Zheng, Tianyi Wang, Zeyu Zhou, Youfang Lin"
tags:
- "IEEE TKDE"
- "2024"
links:
Preprint: "https://arxiv.org/abs/2407.15899"
- title: "Contrastive Pre-training with Adversarial Perturbations for Check-In Sequence Representation Learning"
authors: "Letian Gong, Youfang Lin, Shengnan Guo, <strong>Yan Lin</strong>, Tianyi Wang, Erwen Zheng, Zeyu Zhou, Huaiyu Wan"
tags:
- "AAAI"
- "2023"
links:
Paper: "https://ojs.aaai.org/index.php/AAAI/article/view/25546"
Code: "https://github.com/LetianGong/CACSR"
- title: "Adversarial Self-Attentive Time-Variant Neural Networks for Multi-Step Time Series Forecasting"
authors: "Changxia Gao, Ning Zhang, Youru Li, <strong>Yan Lin</strong>, Huaiyu Wan"
tags:
- "ESWA"
- "2023"
links:
Paper: "https://www.sciencedirect.com/science/article/pii/S0957417423012241"
- title: "Multi-scale Adaptive Attention-based Time-Variant Neural Networks for Multi-step Time Series Forecasting"
authors: "Changxia Gao, Ning Zhang, Youru Li, <strong>Yan Lin</strong>, Huaiyu Wan"
tags:
- "APIN"
- "2023"
links:
Paper: "https://link.springer.com/article/10.1007/s10489-023-05057-7"
- title: "WITRAN: Water-wave Information Transmission and Recurrent Acceleration Network for Long-range Time Series Forecasting"
authors: "Yuxin Jia, Youfang Lin, Xinyan Hao, <strong>Yan Lin</strong>, Shengnan Guo, Huaiyu Wan"
tags:
- "NeurIPS"
- "2023"
links:
Paper: "https://openreview.net/forum?id=y08bkEtNBK"
Code: "https://github.com/Water2sea/WITRAN"
primaryProjects:
- title: 'Research on <i>Prediction of User Travel Destination and Travel Time Based on Trajectory Representation Learning</i>'
tags:
- "Fundamental Research Funds for the Central Universities of China"
desc: "Applies representation learning to trajectory data to transform original features into high-level information, improving the performance of downstream tasks such as travel time and destination prediction."
links: {}
- title: 'Development of <i>OverleafCopilot - Empowering Academic Writing in Overleaf with Large Language Models</i>'
tags:
- "Personal Interest Project"
desc: "This project aims to develop a Browser extension to seamlessly integrate Large Language Models (such as ChatGPT) into the popular online academic writing platform, Overleaf."
links:
Home: "https://www.overleafcopilot.com/"
Install: "https://chromewebstore.google.com/detail/overleaf-copilot/eoadabdpninlhkkbhngoddfjianhlghb"
- title: 'Development of <i>PromptGenius - All-purpose prompts for LLMs</i>'
tags:
- "Personal Interest Project"
desc: "This project focuses on developing a website that offers a wide range of prompt categories, enhancing the versatility of LLMs for various tasks and improving their output quality."
links:
Website: "https://www.promptgenius.site/"
Code: "https://github.com/wenhaomin/ChatGPT-PromptGenius"
secondaryProjects:
- title: 'Research on <i>Inverse Design of Materials Using Diffusion Probabilistic Models</i>'
tags:
- "Villum Foundation"
desc: "This project focuses on developing diffusion probabilistic models to first understand the relationship between chemistry/structure and material properties, then enable the inverse design of new materials with specific properties. This project currently supports my postdoctoral research position."
links: {}
- title: 'Research on <i>Pre-training Representation Learning Methods of Spatial-temporal Trajectory Data for Traffic Prediction</i>'
tags:
- "National Natural Science Foundation of China"
desc: "This project aims to propose pre-training representation learning methods for spatial-temporal trajectory data, modeling multiple features to improve traffic prediction tasks. It demonstrates how trajectory representation learning can enhance traffic data mining."
links: {}
- title: 'Research on <i>Spatial-temporal Trajectory Generation and Representation Learning Methods for Sparsity Problems</i>'
tags:
- "National Natural Science Foundation of China"
desc: "This project explores how to generate high-quality spatial-temporal trajectory data and corresponding representations to address sparsity-related issues, thereby supporting a variety of downstream tasks."
links: {}
presentations:
- title: 'Self-supervised Learning of Trajectory Data'
tags:
- "Guest lecture"
- "Aalborg University"
links:
Slides: "/assets/Self-supervised Learning of Trajectory Data.pdf"
- title: 'PLM4Traj: Leveraging Pre-trained Language Models for Cognizing Movement Patterns and Travel Purposes from Trajectories'
tags:
- "Workshop presentation"
- "KDD 2024"
links:
Slides: "/assets/KDD_2024_Workshop_PLM4Traj.pdf"
Paper: "https://arxiv.org/abs/2405.12459"
- title: 'Origin-Destination Travel Time Oracle for Map-based Services'
tags:
- "Paper Oral"
- "SIGMOD 2024"
links:
Slides: "/assets/SIGMOD-Oral-PPT.pdf"
- title: 'Self-supervised Learning of Spatial-temporal Trajectories'
tags:
- "Tutorial"
- "SpatialDI 2024"
links:
Slides: "/assets/Talk on SpatialDI 2024.pdf"
- title: 'Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction'
tags:
- "Paper Oral"
- "AAAI 2021"
links:
Slides: "/assets/AAAI21 Oral PPT.pdf"
services:
- "IEEE, ACM member"
- "Secretary of IEEE (Denmark Section) Computer Society"
- "Reviewer for journals including TIST, TII, and TVT"
- "Member of program committees of ICLR, KDD, AAAI, CVPR, ICCV, IJCAI, and WWW"
blogs:
- title: "One Step Diffusion Models"
badge: "May 2025"
path: "one-step-diffusion-models"
tldr: "Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps."
- title: "Multi-modal and Multi-function Transformers"
badge: "April 2025"
path: "multi-modal-transformer"
tldr: "Multi-modal and multi-function Transformers enable a single architecture to process diverse data types such as language, images, and videos simultaneously. These models employ techniques like vector quantization and lookup-free quantization to map different modalities into a unified embedding space, allowing the Transformer to handle them within the same sequence. Beyond processing multiple data types, these architectures can also combine different functionalities, such as auto-regressive language generation and diffusion-based image creation, within a single model."

BIN
dist/assets/AAAI21 Oral PPT.pdf vendored Normal file

Binary file not shown.


BIN
dist/assets/SIGMOD-Oral-PPT.pdf vendored Normal file

Binary file not shown.


BIN
dist/assets/Talk on SpatialDI 2024.pdf vendored Normal file

Binary file not shown.


@@ -0,0 +1,229 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Yan Lin's Blog - Multi-modal and Multi-function Transformers</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<script>
MathJax = {
tex: {
inlineMath: [['$', '$'], ['\\(', '\\)']],
displayMath: [['$$', '$$'], ['\\[', '\\]']]
},
options: {
skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code'],
processHtmlClass: 'arithmatex'
}
};
</script>
<style>
a {
font-family: 'Lato', sans-serif;
}
img, .figure {
max-width: min(100%, 800px);
height: auto;
display: block;
margin-left: auto;
margin-right: auto;
}
.blog-title {
font-size: calc(1.35rem + 0.9vw);
font-weight: bold;
}
h1 {
font-size: calc(1.35rem + 0.6vw);
margin-top: 2rem;
}
h2 {
font-size: calc(1.1rem + 0.4vw);
margin-top: 1.5rem;
}
h3 {
font-size: calc(0.95rem + 0.1vw);
font-weight: bold;
margin-top: 1rem;
}
</style>
</head>
<body>
<div class="container">
<header class="border-bottom lh-1 py-3 border-secondary">
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
<a class="link-secondary header-icon px-2 h4" href="/"><i class="bi bi-house-fill"></i></a>
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">Yan Lin's Blog</div>
</div>
<div class="col-2 text-end">
<a class="link-secondary header-icon px-2 h4" href="/blog"><i class="bi bi-list-task"></i></a>
</div>
</div>
</header>
</div>
<main class="container">
<article class="section col-xl-10 col-xxl-9 mx-auto">
<p class="blog-title">Multi-modal and Multi-function Transformers</p>
<p>Transformers have gained immense popularity within deep learning and AI communities in recent years. Since their introduction in <em>Vaswani et al., "Attention Is All You Need"</em>, they have proven to be powerful sequential models across diverse domains, with thousands of variations and "improved versions." The rise of Large Language Models (LLMs), which largely use Transformers as their foundation, has led to another surge in research around this architecture. This trend has even led graph learning and Computer Vision (CV) communities to move beyond their established foundation models (i.e., GNNs and CNNs) and embrace Transformers. This explains the increasing prevalence of graph Transformers and image Transformers today.</p>
<blockquote>
<p>Han et al., “A Survey on Vision Transformer”; Khan et al., “Transformers in Vision”; Yun et al., “Graph Transformer Networks.”</p>
</blockquote>
<p>Beyond "chasing the trend," using Transformer as a unified foundation model offers several advantages:</p>
<ul>
<li>Transformers excel at capturing long-term dependencies. Unlike GNNs and CNNs, which require deeper network structures to cover longer context, Transformers natively support global dependency modeling through their self-attention mechanism. They also avoid the over-smoothing and vanishing-gradient problems that hinder context-length scaling in other network architectures.</li>
<li>Transformers process sequences in parallel rather than sequentially, enabling full utilization of GPU acceleration. This advantage can be further enhanced with techniques like those described in <em>Dao et al., "FlashAttention."</em></li>
<li>Transformers are flexible network structures. They don't inherently enforce sequentiality; without positional encoding, all orderings of the input steps are equivalent to a Transformer. Through strategic permutation and positional encoding, Transformers can adapt to a wide range of structured and unstructured data.</li>
<li>The development of LLMs has made many open-weight Transformer models available with strong natural language understanding capabilities. These Transformers can be prompted and fine-tuned to model other modalities such as spatiotemporal data and images while retaining their language modeling abilities, creating opportunities for developing multi-modal foundation models.</li>
<li>From a practical perspective, using Transformer as a foundation allows reuse of technical infrastructure and optimizations developed over years, including efficient architecture designs, training pipelines, and specialized hardware.</li>
</ul>
<p>In this article, we will briefly explore techniques for unifying multiple modalities (e.g., natural language and images) and multiple functionalities (e.g., language models and diffusion denoisers) within a single Transformer. These techniques are largely sourced from recent oral papers presented at ICML, ICLR, and CVPR conferences. I assume readers have general knowledge of basic concepts in ML and neural networks, Transformers, LLMs, and diffusion models.</p>
<p>Since images and language modalities represent continuous and discrete data respectively, we will use them as examples throughout this article. Keep in mind that the techniques introduced can be readily extended to other modalities, including spatiotemporal data.</p>
<h1>General Goal</h1>
<p>The goal of a multi-modal Transformer is to create a model that can accept multi-modal inputs and produce multi-modal outputs. For example, instead of using a CNN-based image encoder and a Transformer-based language encoder to map image and language modalities to the latent space separately, a multi-modal Transformer would be able to process the combination of image and language (sentence) as a single sequence.</p>
<figure class="figure">
<img alt="image" src="/blog/md/multi-modal-transformer.assets/image.png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">An example of “conventional” multi-modal fusion. Each modality is processed by a separate model, and the results are fused at some point. Source: <em>Xiang, Hao, Runsheng Xu, and Jiaqi Ma. "HM-ViT: Hetero-modal vehicle-to-vehicle cooperative perception with vision transformer." CVPR, 2023.</em></figcaption>
</figure>
<figure class="figure">
<img alt="image (1)" src="/blog/md/multi-modal-transformer.assets/image (1).png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">An example of a Transformer that can handle multi-modal inputs and outputs. Different modalities are all projected into tokens and subsequently processed by a unified Transformer encoder. Source: <em>Kondratyuk, Dan, Lijun Yu, et al. “VideoPoet: A Large Language Model for Zero-Shot Video Generation,” ICML, 2024.</em></figcaption>
</figure>
<p>Beyond multi-modal processing, a multi-function Transformer can, for example, function as both a language model (auto-regressive generation) and diffusion denoiser (score-matching generation) simultaneously, supporting two of the most common generation schemes used today.</p>
<h1>Modality Embedding</h1>
<p>A fundamental challenge in unifying multiple modalities within a single Transformer is how to represent different modalities in the same embedding space. For the "QKV" self-attention mechanism to work properly, each item in the input sequence must be represented by an embedding vector of the same dimension, matching the "model dimension" of the Transformer.</p>
<figure class="figure">
<img alt="image (2)" src="/blog/md/multi-modal-transformer.assets/image (2).png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Illustration of the QKV self-attention mechanism in Transformer. <a href="https://en.wikipedia.org/wiki/Attention_(machine_learning)" class="link" target="_blank">Source</a></figcaption>
</figure>
<p>The most common method for mapping language into the embedding space is through tokenization and token embedding. A tokenizer maps a word or word fragment into a discrete token index, and an index-fetching embedding layer (implemented in frameworks like PyTorch with <code>nn.Embedding</code>) maps this index into a fixed-dimension embedding vector. In principle, all discrete features can be mapped into the embedding space using this approach.</p>
<figure class="figure">
<img alt="1_Dk1X5rmLomXqqTPeuHgBpw" src="/blog/md/multi-modal-transformer.assets/1_Dk1X5rmLomXqqTPeuHgBpw.png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Visualization of tokenizer and index-fetching embedding layer. <a href="https://medium.com/@hunter-j-phillips/the-embedding-layer-27d9c980d124" class="link" target="_blank">Source</a></figcaption>
</figure>
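<p>As a small illustration of this index-fetching step, here is a minimal sketch assuming PyTorch; the toy vocabulary, whitespace tokenizer, and dimensions below are made up for illustration only.</p>
<pre><code class="language-python">import torch
import torch.nn as nn

# Toy vocabulary and whitespace "tokenizer" (illustrative only).
vocab = {"[UNK]": 0, "the": 1, "cat": 2, "sat": 3}
def tokenize(sentence):
    return [vocab.get(w, vocab["[UNK]"]) for w in sentence.lower().split()]

d_model = 8  # Transformer model dimension
embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=d_model)

token_ids = torch.tensor(tokenize("The cat sat"))  # discrete indices, shape (3,)
token_embs = embedding(token_ids)                  # embedding vectors, shape (3, 8)
print(token_embs.shape)
</code></pre>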
<h2>Vector Quantization</h2>
<p>For continuous features, one intuitive approach is to first tokenize them into discrete tokens, thereby unifying the embedding process across both discrete and continuous features. <strong>Vector quantization</strong>, introduced in VQ-VAE, is one of the most common methods for this purpose.</p>
<blockquote>
<p>Van Den Oord, Aaron, and Oriol Vinyals. "Neural discrete representation learning." NeurIPS, 2017.</p>
</blockquote>
<p>Vector quantization maintains a "codebook" <span class="arithmatex">\(\boldsymbol C \in \mathbb R^{n\times d}\)</span>, which functions similarly to the index-fetching embedding layer, where <span class="arithmatex">\(n\)</span> is the total number of unique tokens, and <span class="arithmatex">\(d\)</span> is the embedding size. A given continuous vector <span class="arithmatex">\(\boldsymbol{z}\in\mathbb R^{d}\)</span> is quantized into a discrete value <span class="arithmatex">\(i\in\mathbb [0,n-1]\)</span> by finding the closest row vector in <span class="arithmatex">\(\boldsymbol C\)</span> to <span class="arithmatex">\(\boldsymbol{z}\)</span>, and that row vector <span class="arithmatex">\(\boldsymbol C_i\)</span> is fetched as the embedding for <span class="arithmatex">\(\boldsymbol{z}\)</span>. Formally:
$$
i = \arg\min_j ||\boldsymbol z - \boldsymbol C_j||₂
$$
<img alt="Screen_Shot_2020-06-28_at_4.26.40_PM" src="/blog/md/multi-modal-transformer.assets/Screen_Shot_2020-06-28_at_4.26.40_PM.png" /></p>
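<p>The quantization step itself is compact. Below is a minimal sketch assuming PyTorch; the codebook is randomly initialized rather than learned, and the straight-through gradient trick used to train VQ-VAE is omitted for brevity.</p>
<pre><code class="language-python">import torch

n, d = 512, 64                # codebook size and embedding dimension
codebook = torch.randn(n, d)  # C in R^{n x d}

def vector_quantize(z):
    # z: (batch, d) continuous vectors to be quantized
    dists = torch.cdist(z, codebook)   # pairwise L2 distances, shape (batch, n)
    indices = dists.argmin(dim=-1)     # discrete token index i per vector
    quantized = codebook[indices]      # fetched row vector C_i
    return indices, quantized

z = torch.randn(4, d)
idx, z_q = vector_quantize(z)
print(idx.shape, z_q.shape)            # torch.Size([4]) torch.Size([4, 64])
</code></pre>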
<h2>Lookup-Free Quantization</h2>
<p>A significant limitation of vector quantization is that it requires calculating distances between the given continuous vectors and the entire codebook, which becomes computationally expensive for large-scale codebooks. This creates tension with the need for expanded codebooks to represent complex modalities such as images and videos. Research has shown that simply increasing the number of unique tokens doesn't always improve codebook performance.</p>
<blockquote>
<p>“A simple trick for training a larger codebook involves decreasing the code embedding dimension when increasing the vocabulary size.” Source: <em>Yu, Lijun, Jose Lezama, et al. “Language Model Beats Diffusion - Tokenizer Is Key to Visual Generation,” ICLR, 2024.</em></p>
</blockquote>
<p>Building on this insight, <strong>Lookup-Free Quantization</strong> (LFQ) eliminates the embedding dimension of codebooks (essentially reducing the embedding dimension to 0) and directly calculates the discrete index <span class="arithmatex">\(i\)</span> by individually quantizing each dimension of <span class="arithmatex">\(\boldsymbol z\)</span> into a binary digit. The index <span class="arithmatex">\(i\)</span> can then be computed by converting the binary representation to decimal. Formally:
$$
i=\sum_{j=1}^{d} 2^{j-1}\cdot \mathbb{1}(z_j &gt; 0)
$$</p>
<blockquote>
<p>For example, given a continuous vector <span class="arithmatex">\(\boldsymbol z=\langle -0.52, 1.50, 0.53, -1.32\rangle\)</span>, we first quantize each dimension into <span class="arithmatex">\(\langle 0, 1, 1, 0\rangle\)</span>, based on the sign of each dimension. The token index of <span class="arithmatex">\(\boldsymbol z\)</span> is simply the decimal equivalent of the binary 0110, which is 6.</p>
</blockquote>
<p>However, this approach introduces another challenge: we still need an index-fetching embedding layer to map these token indices into embedding vectors for the Transformer. This, combined with the typically large number of unique tokens when using LFQ—a 32-dimensional <span class="arithmatex">\(\boldsymbol z\)</span> will result in <span class="arithmatex">\(2^{32}=4,294,967,296\)</span> unique tokens—creates significant efficiency problems. One solution is to factorize the token space. Effectively, this means splitting the binary digits into multiple parts, embedding each part separately, and concatenating the resulting embedding vectors. For example, with a 32-dimensional <span class="arithmatex">\(\boldsymbol z\)</span>, if we quantize and embed its first and last 16 dimensions separately, we “only” need to handle <span class="arithmatex">\(2^{16}*2= 131,072\)</span> unique tokens.</p>
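<p>A minimal sketch of lookup-free quantization with a factorized token space is given below, assuming PyTorch; the split into two 16-bit groups mirrors the example above, and the per-group embedding tables and dimensions are illustrative assumptions.</p>
<pre><code class="language-python">import torch
import torch.nn as nn

d, d_model = 32, 256
groups = 2                        # factorize 32 binary digits into 2 groups of 16
bits = d // groups                # 16 bits per group, i.e., 2^16 tokens per table
powers = 2 ** torch.arange(bits)  # [1, 2, 4, ..., 2^15]

# One embedding table per group; group embeddings are concatenated to d_model.
tables = nn.ModuleList(
    [nn.Embedding(2 ** bits, d_model // groups) for _ in range(groups)]
)

def lfq_embed(z):
    # z: (batch, 32) continuous vectors
    binary = (z &gt; 0).long()                  # quantize each dimension by its sign
    binary = binary.view(-1, groups, bits)   # (batch, 2, 16)
    indices = (binary * powers).sum(dim=-1)  # binary digits to a decimal index
    embs = [tables[g](indices[:, g]) for g in range(groups)]
    return torch.cat(embs, dim=-1)           # (batch, d_model)

print(lfq_embed(torch.randn(4, d)).shape)    # torch.Size([4, 256])
</code></pre>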
<p>Note that this section doesn't extensively explain how to map raw continuous features into the vector <span class="arithmatex">\(\boldsymbol{z}\)</span>, as these techniques are relatively straightforward and depend on the specific feature type—for example, fully-connected layers for numerical features, or CNN/GNN with feature flattening for structured data.</p>
<h2>Quantization over Linear Projection</h2>
<p>You might be asking—why can't we simply use linear projections to map the raw continuous features into the embedding space? What are the benefits of quantizing continuous features into discrete tokens?</p>
<p>Although Transformers are regarded as universal sequential models, they were designed for discrete tokens when first introduced in <em>Vaswani et al., "Attention Is All You Need"</em>. Empirically, they tend to perform best on discrete tokens rather than on raw continuous features. This is supported by many research papers claiming that quantizing continuous features improves the performance of Transformers, and by works demonstrating Transformers' subpar performance when applied directly to continuous features.</p>
<blockquote>
<p>Mao, Chengzhi, Lu Jiang, Mostafa Dehghani, Carl Vondrick, Rahul Sukthankar, and Irfan Essa. “Discrete Representations Strengthen Vision Transformer Robustness,” ICLR, 2022.</p>
<p>Ilbert, Romain, Ambroise Odonnat, et al. “SAMformer: Unlocking the Potential of Transformers in Time Series Forecasting with Sharpness-Aware Minimization and Channel-Wise Attention,” ICML, 2024.</p>
</blockquote>
<p>On the other hand, unifying different modalities into tokens is especially beneficial in the context of Transformer-based "foundation models," since it preserves the auto-regressive next-token prediction architecture of LLMs. Combined with special tokens such as "start of sentence" and "end of sentence," the Transformer model can flexibly generate content of mixed modalities and varying length.</p>
<blockquote>
<p>For example, by quantizing videos into discrete tokens and combining the token space of videos and language, one can create a unified Transformer model that generates both videos and language in one sequence. The start and end points of video and language sub-sequences are fully determined by the model, based on the specific input prompt. This structure would be difficult to replicate if we used tokenization for language but linear projection for videos.</p>
</blockquote>
<h1>Transformer Backbone</h1>
<p>After different modalities are mapped into the same embedding space, they can be arranged into a sequence of embedding vectors and input into a Transformer backbone. We don't discuss the variations of Transformer structure and improvement techniques here, as they are numerous, and ultimately function similarly as sequential models.</p>
<blockquote>
<p>Lan et al., “ALBERT”; Ye et al., “Differential Transformer”; Kitaev, Kaiser, and Levskaya, “Reformer”; Su et al., “RoFormer”; Dai et al., “Transformer-XL.”</p>
</blockquote>
<p>As we know, the "full" Transformer structure proposed in <em>Vaswani et al., "Attention Is All You Need"</em> includes an encoder and a decoder. They perform self-attention within their respective input sequences, and the decoder additionally performs cross-attention between its input sequence and the memory sequence derived from the encoder's output. Some early language models use encoder-only structure (like <em>Devlin et al., "BERT"</em>) focused on outputting embedding vectors or encoder-decoder structure (like <em>Chung et al., "Scaling Instruction-Finetuned Language Models"</em>) for generating natural language output. Most modern large language models and foundation models use decoder-only structure (like <em>Brown et al., "Language Models Are Few-Shot Learners"</em>), focusing on auto-regressive generation of language output.</p>
<p>The encoder-only structure theoretically excels at representation learning, and its produced embedding vectors can be applied to various downstream tasks. Recent developments have gradually moved towards decoder-only structure, centered around the idea of building models that are capable of directly generating the required final output of every downstream task.</p>
<blockquote>
<p>For example, to perform sentiment analysis, BERT will compute an embedding vector for the query sentence, and the embedding vector can be used in a dedicated classifier to predict the sentiment label. GPT, on the other hand, can directly answer the question "what is the sentiment associated with the query sentence?" Comparatively, GPT is more versatile in most cases and can easily perform zero-shot prediction.</p>
</blockquote>
<p>Nevertheless, representation learning is still a relevant topic. The general understanding is that decoder-only structure cannot perform conventional representation learning, for example mapping a sentence into a fixed-dimension embedding vector. Yet, there are a few works in the latest ICLR that shed light on the utilization of LLMs as representation learning or embedding models:</p>
<blockquote>
<p>Gao, Leo, Tom Dupre la Tour, Henk Tillman, Gabriel Goh, Rajan Troll, Alec Radford, Ilya Sutskever, Jan Leike, and Jeffrey Wu. “Scaling and Evaluating Sparse Autoencoders,” 2024. <a href="https://openreview.net/forum?id=tcsZt9ZNKD" class="link" target="_blank">Link</a></p>
<p>Li, Ziyue, and Tianyi Zhou. “Your Mixture-of-Experts LLM Is Secretly an Embedding Model for Free,” 2024. <a href="https://openreview.net/forum?id=eFGQ97z5Cd" class="link" target="_blank">Link</a></p>
<p>Zhang, Jie, Dongrui Liu, Chen Qian, Linfeng Zhang, Yong Liu, Yu Qiao, and Jing Shao. “REEF: Representation Encoding Fingerprints for Large Language Models,” 2024. <a href="https://openreview.net/forum?id=SnDmPkOJ0T" class="link" target="_blank">Link</a></p>
</blockquote>
<h1>Output Layer</h1>
<p>For language generation, Transformers typically use classifier output layers, mapping the latent vector of each item in the output sequence back to tokens. As we've established in the "modality embedding" section, the optimal method to embed continuous features is to quantize them into discrete tokens. Correspondingly, an intuitive method to output continuous features is to map these discrete tokens back to the continuous feature space, essentially reversing the vector quantization process.</p>
<h2>Reverse Vector Quantization</h2>
<p>One approach to reverse vector quantization is readily available in VQ-VAE, since it is an auto-encoder. Given a token <span class="arithmatex">\(i\)</span>, we can look up its embedding in the codebook as <span class="arithmatex">\(\boldsymbol C_i\)</span>, then apply a decoder network to map <span class="arithmatex">\(\boldsymbol C_i\)</span> back to the continuous feature vector <span class="arithmatex">\(\boldsymbol z\)</span>. The decoder network can be pre-trained in the VQ-VAE framework—pre-train the VQ-VAE tokenizer, encoder, and decoder using auto-encoding loss functions, or end-to-end trained along with the whole Transformer. In the NLP and CV communities, the pre-training approach is more popular, since there are many large-scale pre-trained auto-encoders available.</p>
<figure class="figure">
<img alt="image (4)" src="/blog/md/multi-modal-transformer.assets/image (4).png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">The encoder-decoder structure of MAGVIT (<em>Yu et al., “MAGVIT”</em>), a visual VQ-VAE model. A 3D-VQ encoder quantizes a video into discrete tokens, and a 3D-VQ decoder maps them back to the pixel space.</figcaption>
</figure>
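<p>Conceptually, the decoding direction is just a codebook lookup followed by a decoder network. A minimal sketch, assuming PyTorch and using a stand-in MLP instead of a real pre-trained VQ-VAE decoder:</p>
<pre><code class="language-python">import torch
import torch.nn as nn

n, d, out_dim = 512, 64, 3 * 16 * 16  # e.g., a flattened 16x16 RGB patch
codebook = nn.Embedding(n, d)         # shared with the tokenizer side
decoder = nn.Sequential(              # stand-in for a pre-trained decoder
    nn.Linear(d, 256), nn.ReLU(), nn.Linear(256, out_dim)
)

def detokenize(indices):
    # indices: (batch,) discrete tokens predicted by the Transformer
    z = codebook(indices)  # look up the codebook embedding C_i
    return decoder(z)      # map back to the continuous feature space

patches = detokenize(torch.randint(0, n, (4,)))
print(patches.shape)       # torch.Size([4, 768])
</code></pre>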
<h2>Efficiency Enhancement</h2>
<p>For continuous feature generation, unlike language generation where the output tokens are the final output, we are essentially representing the final output with a limited size token space. Thus, for complicated continuous features like images and videos, we have to expand the token space or use more tokens to represent one image or one video frame to improve generation quality, which can result in efficiency challenges.</p>
<p>There are several workarounds to improve the efficiency of multi-modal outputs. One approach is to generate low-resolution outputs first, then use a separate super-resolution module to improve the quality of the output. This approach is explored in <em>Kondratyuk et al., "VideoPoet"</em> and <em>Tian et al., "Visual Autoregressive Modeling"</em>. Interestingly, the overall idea is very similar to NVIDIA's DLSS, where the graphics card renders a low-resolution frame (e.g., 1080p) using the conventional rasterization pipeline, then a super-resolution model increases the frame's resolution (e.g., to 4K) utilizing the graphics card's tensor hardware, improving the overall frame rate of games.</p>
<p>Another workaround follows the idea of compression. Take video generation as an example: the model generates full features for key frames, and lightweight motion-vector features that describe the subtle differences from those key frames. This is essentially how inter-frame video compression codecs work, taking advantage of the temporal redundancy between neighboring frames.</p>
<figure class="figure">
<img alt="image (5)" src="/blog/md/multi-modal-transformer.assets/image (5).png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Key frames and motion vectors used in <em>Jin et al., “Video-LaVIT.”</em></figcaption>
</figure>
<h1>Fuse with Diffusion Models</h1>
<p>Despite continuous efforts to enable representation and generation of images and videos with a language-model (auto-regressive) structure, current research indicates that diffusion models (more broadly speaking, score-matching generative models) outperform language models on continuous feature generation. Score-matching generative models have their own separate and substantial community, with strong theoretical foundations and numerous variations emerging each year, such as stochastic differential equations, Bayesian flow, and rectified flow. In short, score-matching generative models are clearly here to stay alongside language models.</p>
<p>An intriguing question arises: why not integrate the structures of language models and diffusion models into one Transformer to get the best of both worlds? <em>Zhou et al. in "Transfusion"</em> explored this idea. The approach is straightforward: build a Transformer that can handle both language and image inputs and outputs. The language component functions as a language model, while the image component serves as a denoiser network for diffusion models. The model is trained by combining the language modeling loss and the DDPM loss, enabling it to function either as a language model or as a text-to-image denoiser.</p>
<figure class="figure">
<img alt="image (6)" src="/blog/md/multi-modal-transformer.assets/image (6).png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">A Transformer capable of functioning as a language model and a diffusion denoiser at the same time. Source: <em>Zhou, Chunting, Lili Yu, et al. “Transfusion: Predict the Next Token and Diffuse Images with One Multi-Modal Model,” ICLR, 2025.</em></figcaption>
</figure>
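<p>A rough sketch of such a combined objective is shown below, assuming PyTorch; the model interface, the <code>alpha_bar</code> noise schedule, and all shapes are illustrative assumptions rather than the actual Transfusion implementation.</p>
<pre><code class="language-python">import torch
import torch.nn.functional as F

def combined_lm_ddpm_loss(model, text_tokens, image_latents, alpha_bar):
    """Hypothetical joint objective: next-token loss on text + DDPM loss on images."""
    # Standard DDPM forward noising of the image latents.
    t = torch.randint(0, alpha_bar.numel(), (image_latents.size(0),))
    a = alpha_bar[t].view(-1, 1, 1, 1)
    noise = torch.randn_like(image_latents)
    noisy_latents = a.sqrt() * image_latents + (1 - a).sqrt() * noise

    # The (assumed) model consumes both modalities in one sequence and returns
    # next-token logits for the text part and a noise prediction for the image part.
    text_logits, eps_pred = model(text_tokens, noisy_latents, t)

    lm_loss = F.cross_entropy(
        text_logits[:, :-1].reshape(-1, text_logits.size(-1)),
        text_tokens[:, 1:].reshape(-1),
    )
    ddpm_loss = F.mse_loss(eps_pred, noise)
    return lm_loss + ddpm_loss
</code></pre>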
<h1>Conclusion</h1>
<p>In conclusion, the evolution of Transformers into versatile foundation models capable of handling multiple modalities and functionalities represents a significant advancement in AI research. By enabling a single architecture to process diverse data types through techniques like vector quantization and lookup-free quantization, researchers have created models that can seamlessly integrate language, images, and other modalities within the same embedding space.</p>
<p>In our research domain, we encounter even more diverse and domain-specific multi-modal data, such as traffic flows, trajectories, and real-world agent interactions. A unified Transformer for such data presents a promising solution for creating "foundation models" that generalize across diverse tasks and scenarios. However, domain-specific challenges, including data encoding and decoding, computational efficiency, and scalability, must be addressed to realize this potential.</p>
</article>
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">Copyright © 2025. Designed and implemented by Yan Lin.</p>
</main>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
</body>
</html>
<script>
document.addEventListener('DOMContentLoaded', function() {
document.querySelectorAll('img').forEach(function(img) {
img.classList.add('figure-img', 'rounded');
});
});
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
var backToTopButton = document.getElementById('back-to-top');
if (window.scrollY > 100) {
backToTopButton.style.display = 'block';
} else {
backToTopButton.style.display = 'none';
}
});
// Scroll to top when the button is clicked
document.getElementById('back-to-top').addEventListener('click', function(e) {
e.preventDefault();
window.scrollTo({
top: 0,
behavior: 'smooth'
});
window.location.href = '#';
return false;
});
</script>


@@ -0,0 +1,216 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Yan Lin's Blog - One Step Diffusion Models</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<script>
MathJax = {
tex: {
inlineMath: [['$', '$'], ['\\(', '\\)']],
displayMath: [['$$', '$$'], ['\\[', '\\]']]
},
options: {
skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code'],
processHtmlClass: 'arithmatex'
}
};
</script>
<style>
a {
font-family: 'Lato', sans-serif;
}
img, .figure {
max-width: min(100%, 800px);
height: auto;
display: block;
margin-left: auto;
margin-right: auto;
}
.blog-title {
font-size: calc(1.35rem + 0.9vw);
font-weight: bold;
}
h1 {
font-size: calc(1.35rem + 0.6vw);
margin-top: 2rem;
}
h2 {
font-size: calc(1.1rem + 0.4vw);
margin-top: 1.5rem;
}
h3 {
font-size: calc(0.95rem + 0.1vw);
font-weight: bold;
margin-top: 1rem;
}
</style>
</head>
<body>
<div class="container">
<header class="border-bottom lh-1 py-3 border-secondary">
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
<a class="link-secondary header-icon px-2 h4" href="/"><i class="bi bi-house-fill"></i></a>
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">Yan Lin's Blog</div>
</div>
<div class="col-2 text-end">
<a class="link-secondary header-icon px-2 h4" href="/blog"><i class="bi bi-list-task"></i></a>
</div>
</div>
</header>
</div>
<main class="container">
<article class="section col-xl-10 col-xxl-9 mx-auto">
<p class="blog-title">One Step Diffusion Models</p>
<p>Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.</p>
<hr />
<h1>Background</h1>
<p>Diffusion models (DMs), or more broadly speaking, score-matching generative models, have become the de facto framework for building deep generative models. They demonstrate exceptional generation performance, especially on continuous modalities including images, videos, audio, and spatiotemporal data.</p>
<p>Most diffusion models work by coupling a forward diffusion process and a reverse denoising diffusion process. The forward diffusion process gradually adds noise to the ground truth clean data <span class="arithmatex">\(X_0\)</span>, until noisy data <span class="arithmatex">\(X_T\)</span> that follows a relatively simple distribution is reached. The reverse denoising diffusion process starts from the noisy data <span class="arithmatex">\(X_T\)</span>, and removes the noise component step-by-step until clean generated data <span class="arithmatex">\(X_0\)</span> is reached. The reverse process is essentially a sequential Markov-chain sampling process, meaning the steps of a single generation cannot be parallelized, which can be inefficient when the number of steps is large.</p>
<figure class="figure">
<img alt="image-20250503125941212" src="/blog/md/one-step-diffusion-models.assets/image-20250503125941212.png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">The two processes in a typical diffusion model. <em>Source: Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”</em></figcaption>
</figure>
<h2>Understanding DMs</h2>
<p>There are many ways to understand how Diffusion Models (DMs) work. One of the most common and intuitive approaches is that a DM learns an ordinary differential equation (ODE) that transforms noise into data. Imagine an ODE vector field between the noise <span class="arithmatex">\(X_T\)</span> and clean data <span class="arithmatex">\(X_0\)</span>. By training on sufficiently large numbers of timesteps <span class="arithmatex">\(t\in [0,T]\)</span>, a DM is able to learn the vector (tangent) towards the cleaner data <span class="arithmatex">\(X_{t-\Delta t}\)</span>, given any specific timestep <span class="arithmatex">\(t\)</span> and the corresponding noisy data <span class="arithmatex">\(X_t\)</span>. This idea is easy to illustrate in a simplified 1-dimensional data scenario.</p>
<figure class="figure">
<img alt="image-20250503132738122" src="/blog/md/one-step-diffusion-models.assets/image-20250503132738122.png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Illustrated ODE flow of a diffusion model on 1-dimensional data. <em>Source: Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”</em> It should be noted that as the figure suggests, there are differences between ODEs and DMs in a narrow sense. Flow matching models, a variant of DMs, more closely resemble ODEs.</figcaption>
</figure>
<h2>DMs Scale Poorly with Few Steps</h2>
<p>Vanilla DDPM, which is essentially a discrete-timestep DM, can only perform the reverse process using the same number of steps it is trained on, typically thousands. DDIM introduces a reparameterization scheme that enables skipping steps during the reverse process of DDPM. Continuous-timestep DMs like Stochastic Differential Equations (SDE) naturally possess the capability of using fewer steps in the reverse process compared to the forward process/training.</p>
<blockquote>
<p>Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”
Song, Meng, and Ermon, “Denoising Diffusion Implicit Models.”
Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”</p>
</blockquote>
<p>Nevertheless, it is observed that their performance typically suffers catastrophic degradation when reducing the number of reverse process steps to single digits.</p>
<figure class="figure">
<img alt="image-20250503135351246" src="/blog/md/one-step-diffusion-models.assets/image-20250503135351246.png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Images generated by conventional DMs with only a few steps of reverse process. <em>Source: Frans et al., “One Step Diffusion via Shortcut Models.”</em></figcaption>
</figure>
<p>To understand why DMs scale poorly with few reverse process steps, we can return to the ODE vector field perspective of DMs. When the target data distribution is complex, the vector field typically contains numerous intersections. When a given <span class="arithmatex">\(X_t\)</span> and <span class="arithmatex">\(t\)</span> fall on one of these intersections, the vector points in the averaged direction of all candidate directions. This causes the generated data to approach the mean of the training data when only a few reverse process steps are used. Another explanation is that the learned vector field is highly curved. Using only a few reverse process steps means attempting to approximate these curves with polylines, which is inherently difficult.</p>
<figure class="figure">
<img alt="image-20250503141422791" src="/blog/md/one-step-diffusion-models.assets/image-20250503141422791.png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Illustration of why DMs scale poorly with few reverse process steps. <em>Source: Frans et al., “One Step Diffusion via Shortcut Models.”</em></figcaption>
</figure>
<p>We will introduce two branches of methods that aim to scale DMs to few or even single reverse process steps: <strong>distillation-based</strong> methods, which distill a pre-trained DM into a one-step model; and <strong>end-to-end</strong> methods, which train a one-step DM from scratch.</p>
<h1>Distillation</h1>
<p>Distillation-based methods are also called <strong>rectified flow</strong> methods. Their idea follows the above insight of "curved ODE vector field": if the curved vectors (flows) are hindering the scaling of reverse process steps, can we try to straighten these vectors so that they are easy to approximate with polylines or even straight lines?</p>
<p><em>Liu, Gong, and Liu, "Flow Straight and Fast"</em> implements this idea, focusing on learning an ODE that follows straight vectors as much as possible. In the context of continuous-time DMs where <span class="arithmatex">\(T=1\)</span> and <span class="arithmatex">\(t\in[0,1]\)</span>, suppose the clean data <span class="arithmatex">\(X_0\)</span> and noise <span class="arithmatex">\(X_1\)</span> each follow a distribution, <span class="arithmatex">\(X_0\sim \pi_0\)</span> and <span class="arithmatex">\(X_1\sim \pi_1\)</span>. The "straight vectors" can be obtained by solving a nonlinear least squares optimization problem:
$$
\min_{v} \int_{0}^{1} \mathbb{E}\left[\left\|\left(X_{1}-X_{0}\right)-v\left(X_{t}, t\right)\right\|^{2}\right] \mathrm{d} t,
$$</p>
<div class="arithmatex">\[
\quad X_{t}=t X_{1}+(1-t) X_{0}
\]</div>
<p>where <span class="arithmatex">\(v\)</span> is the vector field of the ODE <span class="arithmatex">\(dZ_t = v(Z_t,t)dt\)</span>.</p>
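<p>This objective is simple to express in code. Below is a minimal training and sampling sketch assuming PyTorch; <code>v_net</code> is an assumed velocity network that takes the interpolated state and the timestep and returns a vector of the same shape.</p>
<pre><code class="language-python">import torch

def rectified_flow_loss(v_net, x0, x1):
    """One training step of the straight-flow objective (sketch).
    x0: samples of clean data, x1: samples of noise (same shape)."""
    t = torch.rand(x0.size(0), *([1] * (x0.dim() - 1)))  # t ~ U[0, 1]
    xt = t * x1 + (1 - t) * x0                           # linear interpolation X_t
    target = x1 - x0                                     # straight-line velocity
    return ((v_net(xt, t) - target) ** 2).mean()

@torch.no_grad()
def sample(v_net, x1, steps=1):
    """Euler integration of the learned ODE from noise (t=1) toward data (t=0)."""
    xt, dt = x1, 1.0 / steps
    for k in range(steps):
        t = torch.full((x1.size(0), *([1] * (x1.dim() - 1))), 1.0 - k * dt)
        xt = xt - v_net(xt, t) * dt
    return xt
</code></pre>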
<p>Though straightforward, when the clean data distribution <span class="arithmatex">\(\pi_0\)</span> is very complicated, the ideal result of completely straight vectors can be hard to achieve. To address this, a "reflow" procedure is introduced. This procedure iteratively trains new rectified flows using data generated by previously obtained flows:
$$
Z^{(k+1)} = \text{RectFlow}\left(\left(Z_0^{(k)}, Z_1^{(k)}\right)\right)
$$
Successive reflow rounds produce increasingly straight flows that can be simulated with very few steps, ideally a single step after several iterations.</p>
<figure class="figure">
<img alt="image-20250504142749208" src="/blog/md/one-step-diffusion-models.assets/image-20250504142749208.png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Illustration of the vector fields after different numbers of reflow iterations. <em>Source: Liu, Gong, and Liu, “Flow Straight and Fast.”</em></figcaption>
</figure>
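<p>The pairing logic of one reflow round can be sketched as follows, reusing the <code>sample</code> and <code>rectified_flow_loss</code> helpers from the sketch above; the training-loop details are assumptions.</p>
<pre><code class="language-python">import torch

@torch.no_grad()
def make_reflow_pairs(v_net, n_pairs, data_shape, steps=100):
    """Generate coupled pairs (Z_0^k, Z_1^k) by simulating the current flow."""
    z1 = torch.randn(n_pairs, *data_shape)   # fresh noise samples
    z0 = sample(v_net, z1, steps=steps)      # data generated by the current flow
    return z0, z1                            # keep the induced coupling

def reflow_step(new_v_net, old_v_net, optimizer, n_pairs, data_shape):
    """Train the next rectified flow on pairs produced by the previous one."""
    z0, z1 = make_reflow_pairs(old_v_net, n_pairs, data_shape)
    loss = rectified_flow_loss(new_v_net, z0, z1)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
</code></pre>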
<p>In practice, distillation-based methods are usually trained in two stages: first train a normal DM, and later distill one-step capabilities into it. This introduces additional computational overhead and complexity.</p>
<h1>End-to-end</h1>
<p>Compared to distillation-based methods, end-to-end-based methods train a one-step-capable diffusion model (DM) within a single training run. Various techniques are used to implement such methods. We will focus on two of them: <strong>consistency models</strong> and <strong>shortcut models</strong>.</p>
<h2>Consistency Models</h2>
<p>In discrete-timestep diffusion models (DMs), three components in the reverse denoising diffusion process are interchangeable through reparameterization: the noise component <span class="arithmatex">\(\epsilon_t\)</span> to remove, the less noisy previous step <span class="arithmatex">\(x_{t-1}\)</span>, and the predicted clean sample <span class="arithmatex">\(x_0\)</span>. This interchangeability is enabled by the following equation:
$$
x_t = \sqrt{\bar{\alpha}_t} \, x_0 + \sqrt{1 - \bar{\alpha}_t} \, \epsilon_t
$$
In theory, without altering the fundamental formulation of DMs, the learnable denoiser network can be designed to predict any of these three components. Consistency models (CMs) follow this principle by training the denoiser to specifically predict the clean sample <span class="arithmatex">\(x_0\)</span>. The benefit of this approach is that CMs can naturally scale to perform the reverse process with few steps or even a single step.</p>
<figure class="figure">
<img alt="image-20250504161430743" src="/blog/md/one-step-diffusion-models.assets/image-20250504161430743.png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">A consistency model that learns to map any point on the ODE trajectory to the clean sample. <em>Source: Song et al., “Consistency Models.”</em></figcaption>
</figure>
<p>Formally, CMs learn a function <span class="arithmatex">\(f_\theta(x_t,t)\)</span> that maps noisy data <span class="arithmatex">\(x_t\)</span> at time <span class="arithmatex">\(t\)</span> directly to the clean data <span class="arithmatex">\(x_0\)</span>, satisfying:
$$
f_\theta(x_t, t) = f_\theta(x_{t'}, t') \quad \forall t, t'
$$
The model must also obey the differential consistency condition:
$$
\frac{d}{dt} f_\theta(x_t, t) = 0
$$
CMs are trained by minimizing the discrepancy between outputs at adjacent times, with the loss function:
$$
\mathcal{L} = \mathbb{E} \left[ d\left(f_\theta(x_t, t), f_\theta(x_{t'}, t')\right) \right]
$$
Similar to continuous-timestep DMs and discrete-timestep DMs, CMs also have continuous-time and discrete-time variants. Discrete-time CMs are easier to train, but are more sensitive to timestep scheduling and suffer from discretization errors. Continuous-time CMs, on the other hand, suffer from instability during training.</p>
<p>For a deeper discussion of the differences between the two variants of CMs, and how to stabilize continuous-time CMs, please refer to <em>Lu and Song, "Simplifying, Stabilizing and Scaling Continuous-Time Consistency Models."</em></p>
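<p>To make the training recipe concrete, here is a minimal discrete-time sketch assuming PyTorch; the denoiser <code>f_net</code>, the <code>alpha_bar</code> schedule, the shared-noise construction of adjacent states, and the squared-error distance are all simplifying assumptions (practical CMs additionally use an EMA target network and a boundary parameterization).</p>
<pre><code class="language-python">import torch

def consistency_loss(f_net, x0, alpha_bar):
    """Discrete-time CM training sketch: outputs at adjacent timesteps should match."""
    B = x0.size(0)
    t = torch.randint(1, alpha_bar.numel(), (B,))  # pick an adjacent pair (t, t-1)
    noise = torch.randn_like(x0)

    def noisy(step):
        a = alpha_bar[step].view(B, *([1] * (x0.dim() - 1)))
        return a.sqrt() * x0 + (1 - a).sqrt() * noise  # x_t reparameterization

    pred = f_net(noisy(t), t)                # f_theta(x_t, t), predicts clean x_0
    with torch.no_grad():                    # stop-gradient target branch
        target = f_net(noisy(t - 1), t - 1)  # f_theta(x_{t-1}, t-1)
    return ((pred - target) ** 2).mean()     # d(f(x_t, t), f(x_{t-1}, t-1))
</code></pre>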
<h2>Shortcut Models</h2>
<p>Similar to distillation-based methods, the core idea of shortcut models is inspired by the "curved vector field" problem, but shortcut models take a different approach to solving it.</p>
<p>Shortcut models are introduced in <em>Frans et al., "One Step Diffusion via Shortcut Models."</em> The paper presents the insight that conventional DMs perform badly when jumping with large step sizes because they are unaware of the step size they are about to jump forward. Since they are only trained with small step sizes, they only learn the tangents of the curved vector field, not the "correct direction" to take when a large step size is used.</p>
<p>Based on this insight, on top of <span class="arithmatex">\(x_t\)</span> and <span class="arithmatex">\(t\)</span>, shortcut models additionally include step size <span class="arithmatex">\(d\)</span> as part of the condition for the denoiser network. At small step sizes (<span class="arithmatex">\(d\rightarrow 0\)</span>), the model behaves like a standard flow-matching model, learning the expected tangent from noise to data. For larger step sizes, the model learns that one large step should equal two consecutive smaller steps (self-consistency), creating a binary recursive formulation. The model is trained by combining the standard flow matching loss when <span class="arithmatex">\(d=0\)</span> and the self-consistency loss when <span class="arithmatex">\(d&gt;0\)</span>:
$$
\mathcal{L} = \mathbb{E} \left[ \underbrace{\left\| s_\theta(x_t, t, 0) - (x_1 - x_0)\right\|^2}_{\text{Flow-Matching}} + \underbrace{\left\|s_\theta(x_t, t, 2d) - \mathbf{s}_{\text{target}}\right\|^2}_{\text{Self-Consistency}} \right],
$$</p>
<div class="arithmatex">\[
\text{where} \quad \mathbf{s}_{\text{target}} = s_\theta(x_t, t, d)/2 + s_\theta(x'_{t+d}, t + d, d)/2 \quad \text{and} \quad x'_{t+d} = x_t + s_\theta(x_t, t, d)\, d.
\]</div>
<figure class="figure">
<img alt="image-20250504180714955" src="/blog/md/one-step-diffusion-models.assets/image-20250504180714955.png" class="figure-img img-fluid rounded">
<figcaption class="figure-caption">Illustration of the training process of shortcut models. <em>Source: Frans et al., “One Step Diffusion via Shortcut Models.”</em></figcaption>
</figure>
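<p>A compact sketch of this combined objective is given below, assuming PyTorch and the linear interpolation convention of the equations above; <code>s_net(x, t, d)</code> is an assumed network conditioned on the step size, and the stop-gradient on the self-consistency target is a simplifying choice.</p>
<pre><code class="language-python">import torch

def shortcut_loss(s_net, x0, x1, d=0.25):
    """Flow matching at step size 0 plus self-consistency between one 2d-jump
    and two consecutive d-jumps (sketch)."""
    B = x0.size(0)
    # Sample t so that both d-jumps stay inside [0, 1].
    t = torch.rand(B, *([1] * (x0.dim() - 1))) * (1 - 2 * d)
    xt = t * x1 + (1 - t) * x0

    # Flow-matching term: at d=0 the model learns the instantaneous velocity.
    fm = ((s_net(xt, t, 0.0) - (x1 - x0)) ** 2).mean()

    # Self-consistency term: one jump of size 2d should equal two jumps of size d.
    with torch.no_grad():
        s1 = s_net(xt, t, d)
        x_next = xt + s1 * d                 # x'_{t+d}
        s2 = s_net(x_next, t + d, d)
        s_target = s1 / 2 + s2 / 2
    sc = ((s_net(xt, t, 2 * d) - s_target) ** 2).mean()
    return fm + sc
</code></pre>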
<p>Both consistency models and shortcut models can be seamlessly scaled between one-step and multi-step generation to balance quality and efficiency.</p>
</article>
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">Copyright © 2025. Designed and implemented by Yan Lin.</p>
</main>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
</body>
</html>
<script>
document.addEventListener('DOMContentLoaded', function() {
document.querySelectorAll('img').forEach(function(img) {
img.classList.add('figure-img', 'rounded');
});
});
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
var backToTopButton = document.getElementById('back-to-top');
if (window.scrollY > 100) {
backToTopButton.style.display = 'block';
} else {
backToTopButton.style.display = 'none';
}
});
// Scroll to top when the button is clicked
document.getElementById('back-to-top').addEventListener('click', function(e) {
e.preventDefault();
window.scrollTo({
top: 0,
behavior: 'smooth'
});
window.location.href = '#';
return false;
});
</script>

96
dist/blog/index.html vendored Normal file

@@ -0,0 +1,96 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Yan Lin's Blog</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
</head>
<body>
<main class="container">
<header class="border-bottom lh-1 py-3 border-secondary">
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
<a class="link-secondary header-icon px-2 h4" href="/"><i class="bi bi-house-fill"></i></a>
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">Yan Lin's Blog</div>
</div>
<div class="col-2 text-end">
</div>
</div>
</header>
<article class="section mt-4">
<div class="list-group list-group-flush">
<div class="list-group-item px-0">
<a class="mb-1 paper-title blog-link" href="/blog/html/one-step-diffusion-models.html">One Step Diffusion Models</a> | <span class="paper-title text-muted">May 2025</span>
<p class="card-text mb-auto tldr">Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.</p>
</div>
<div class="list-group-item px-0">
<a class="mb-1 paper-title blog-link" href="/blog/html/multi-modal-transformer.html">Multi-modal and Multi-function Transformers</a> | <span class="paper-title text-muted">April 2025</span>
<p class="card-text mb-auto tldr">Multi-modal and multi-function Transformers enables a single architecture to process diverse data types such as language, images, and videos simultaneously. These models employ techniques like vector quantization and lookup-free quantization to map different modalities into a unified embedding space, allowing the Transformer to handle them within the same sequence. Beyond processing multiple data types, these architectures can also combine different functionalities-such as auto-regressive language generation and diffusion-based image creation-within a single model.</p>
</div>
</div>
</article>
</main>
<footer>
<div class="container">
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">
Copyright © 2025. Designed and implemented by Yan Lin.
</p>
</div>
</footer>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
<script>
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
var backToTopButton = document.getElementById('back-to-top');
if (window.scrollY > 100) {
backToTopButton.style.display = 'block';
} else {
backToTopButton.style.display = 'none';
}
});
// Scroll to top when the button is clicked
document.getElementById('back-to-top').addEventListener('click', function(e) {
e.preventDefault();
window.scrollTo({
top: 0,
behavior: 'smooth'
});
window.location.href = '#';
return false;
});
</script>
</body>
</html>

Binary file not shown.

After

Width:  |  Height:  |  Size: 134 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 351 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 375 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 173 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 421 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 730 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 154 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 304 KiB

148
dist/blog/md/multi-modal-transformer.md vendored Normal file
View file

@ -0,0 +1,148 @@
# Multi-modal and Multi-function Transformers
Transformers have gained immense popularity within deep learning and AI communities in recent years. Since their introduction in *Vaswani et al., "Attention Is All You Need"*, they have proven to be powerful sequential models across diverse domains, with thousands of variations and "improved versions." The rise of Large Language Models (LLMs), which largely use Transformers as their foundation, has led to another surge in research around this architecture. This trend has even led graph learning and Computer Vision (CV) communities to move beyond their established foundation models (i.e., GNNs and CNNs) and embrace Transformers. This explains the increasing prevalence of graph Transformers and image Transformers today.
> Han et al., “A Survey on Vision Transformer”; Khan et al., “Transformers in Vision”; Yun et al., “Graph Transformer Networks.”
Beyond "chasing the trend," using Transformer as a unified foundation model offers several advantages:
- Transformers excel at capturing long-term dependencies. Unlike GNNs and CNNs, which require deeper network structures to cover longer context, Transformers natively support global dependency modeling through their self-attention mechanism. They also avoid the over-smoothing and vanishing-gradient problems that hinder context-length scaling in other network architectures.
- Transformers process sequences in parallel rather than sequentially, enabling full utilization of GPU acceleration. This advantage can be further enhanced with techniques like those described in *Dao et al., "FlashAttention."*
- Transformers are flexible network structures. They don't inherently enforce sequentiality: without positional encoding, a Transformer is invariant to the ordering of its input steps. Through strategic permutation and positional encoding, Transformers can adapt to a wide range of structured and unstructured data.
- The development of LLMs has made many open-weight Transformer models available with strong natural language understanding capabilities. These Transformers can be prompted and fine-tuned to model other modalities such as spatiotemporal data and images while retaining their language modeling abilities, creating opportunities for developing multi-modal foundation models.
- From a practical perspective, using Transformer as a foundation allows reuse of technical infrastructure and optimizations developed over years, including efficient architecture designs, training pipelines, and specialized hardware.
In this article, we will briefly explore techniques for unifying multiple modalities (e.g., natural language and images) and multiple functionalities (e.g., language models and diffusion denoisers) within a single Transformer. These techniques are largely sourced from recent oral papers presented at ICML, ICLR, and CVPR conferences. I assume readers have general knowledge of basic concepts in ML and neural networks, Transformers, LLMs, and diffusion models.
Since images and language modalities represent continuous and discrete data respectively, we will use them as examples throughout this article. Keep in mind that the techniques introduced can be readily extended to other modalities, including spatiotemporal data.
# General Goal
The goal of a multi-modal Transformer is to create a model that can accept multi-modal inputs and produce multi-modal outputs. For example, instead of using a CNN-based image encoder and a Transformer-based language encoder to map image and language modalities to the latent space separately, a multi-modal Transformer would be able to process the combination of image and language (sentence) as a single sequence.
![image](multi-modal-transformer.assets/image.png)
> An example of “conventional” multi-modal fusion. Different modality is processed by separate models and fused at some point. Source: *Xiang, Hao, Runsheng Xu, and Jiaqi Ma. "HM-ViT: Hetero-modal vehicle-to-vehicle cooperative perception with vision transformer." CVPR, 2023.*
![image (1)](multi-modal-transformer.assets/image (1).png)
> An example of a Transformer that can handle multi-modal inputs and outputs. Different modalities are all projected into tokens and subsequently processed by a unified Transformer encoder. Source: *Kondratyuk, Dan, Lijun Yu, et al. “VideoPoet: A Large Language Model for Zero-Shot Video Generation,” ICML, 2024.*
Beyond multi-modal processing, a multi-function Transformer can, for example, function as both a language model (auto-regressive generation) and diffusion denoiser (score-matching generation) simultaneously, supporting two of the most common generation schemes used today.
# Modality Embedding
A fundamental challenge in unifying multiple modalities within a single Transformer is how to represent different modalities in the same embedding space. For the "QKV" self-attention mechanism to work properly, each item in the input sequence must be represented by an embedding vector of the same dimension, matching the "model dimension" of the Transformer.
![image (2)](multi-modal-transformer.assets/image (2).png)
> Illustration of the QKV self-attention mechanism in Transformer. [Source](https://en.wikipedia.org/wiki/Attention_(machine_learning))
The most common method for mapping language into the embedding space is through tokenization and token embedding. A tokenizer maps a word or word fragment into a discrete token index, and an index-fetching embedding layer (implemented in frameworks like PyTorch with `nn.Embedding`) maps this index into a fixed-dimension embedding vector. In principle, all discrete features can be mapped into the embedding space using this approach.
![1_Dk1X5rmLomXqqTPeuHgBpw](multi-modal-transformer.assets/1_Dk1X5rmLomXqqTPeuHgBpw.png)
> Visualization of tokenizer and index-fetching embedding layer. [Source](https://medium.com/@hunter-j-phillips/the-embedding-layer-27d9c980d124)
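To make this concrete, here is a minimal PyTorch sketch of the index-fetching embedding step (the toy vocabulary, embedding size, and example sentence are made-up illustrations; a real system would use a trained subword tokenizer such as BPE):
```python
import torch
import torch.nn as nn

# Toy vocabulary standing in for a real trained tokenizer (e.g., BPE).
vocab = {"<pad>": 0, "the": 1, "cat": 2, "sat": 3}
embed_dim = 8  # the "model dimension" of the Transformer

# Index-fetching embedding layer: one embedding vector per discrete token.
token_embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=embed_dim)

sentence = ["the", "cat", "sat"]
token_ids = torch.tensor([[vocab[w] for w in sentence]])  # shape: (1, 3)
embeddings = token_embedding(token_ids)                   # shape: (1, 3, 8)
```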
## Vector Quantization
For continuous features, one intuitive approach is to first tokenize them into discrete tokens, thereby unifying the embedding process across both discrete and continuous features. **Vector quantization**, introduced in VQ-VAE, is one of the most common methods for this purpose.
> Van Den Oord, Aaron, and Oriol Vinyals. "Neural discrete representation learning." NeurIPS, 2017.
Vector quantization maintains a "codebook" $\boldsymbol C \in \mathbb R^{n\times d}$, which functions similarly to the index-fetching embedding layer, where $n$ is the total number of unique tokens, and $d$ is the embedding size. A given continuous vector $\boldsymbol{z}\in\mathbb R^{d}$ is quantized into a discrete value $i\in\{0,\dots,n-1\}$ by finding the closest row vector in $\boldsymbol C$ to $\boldsymbol{z}$, and that row vector $\boldsymbol C_i$ is fetched as the embedding for $\boldsymbol{z}$. Formally:
$$
i = \arg\min_j \|\boldsymbol z - \boldsymbol C_j\|_2
$$
![Screen_Shot_2020-06-28_at_4.26.40_PM](multi-modal-transformer.assets/Screen_Shot_2020-06-28_at_4.26.40_PM.png)
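Below is a minimal sketch of this nearest-neighbor lookup (the codebook size, dimension, and random codebook are placeholder assumptions; a full VQ-VAE would additionally learn the codebook with commitment losses and a straight-through gradient estimator):
```python
import torch

n, d = 512, 64                    # codebook size and embedding dimension (placeholders)
codebook = torch.randn(n, d)      # C in R^{n x d}; normally a learned nn.Parameter

def quantize(z):
    """Map continuous vectors z of shape (batch, d) to token indices and embeddings."""
    distances = torch.cdist(z, codebook)   # L2 distance from each z to every row C_j
    indices = distances.argmin(dim=-1)     # i = argmin_j ||z - C_j||_2
    return indices, codebook[indices]      # fetched embeddings C_i

z = torch.randn(4, d)
idx, z_q = quantize(z)                     # idx: (4,), z_q: (4, 64)
```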
## Lookup-Free Quantization
A significant limitation of vector quantization is that it requires calculating distances between the given continuous vectors and the entire codebook, which becomes computationally expensive for large-scale codebooks. This creates tension with the need for expanded codebooks to represent complex modalities such as images and videos. Research has shown that simply increasing the number of unique tokens doesn't always improve codebook performance.
> “A simple trick for training a larger codebook involves decreasing the code embedding dimension when increasing the vocabulary size.” Source: *Yu, Lijun, Jose Lezama, et al. “Language Model Beats Diffusion - Tokenizer Is Key to Visual Generation,” ICLR, 2024.*
Building on this insight, **Lookup-Free Quantization** (LFQ) eliminates the embedding dimension of codebooks (essentially reducing the embedding dimension to 0) and directly calculates the discrete index $i$ by individually quantizing each dimension of $\boldsymbol z$ into a binary digit. The index $i$ can then be computed by converting the binary representation to decimal. Formally:
$$
i=\sum_{j=1}^{d} 2^{(j-1)}\cdot \mathbb{1}(z_j > 0)
$$
> For example, given a continuous vector $\boldsymbol z=\langle -0.52, 1.50, 0.53, -1.32\rangle$, we first quantize each dimension into $\langle 0, 1, 1, 0\rangle$, based on the sign of each dimension. The token index of $\boldsymbol z$ is simply the decimal equivalent of the binary 0110, which is 6.
However, this approach introduces another challenge: we still need an index-fetching embedding layer to map these token indices into embedding vectors for the Transformer. This, combined with the typically large number of unique tokens when using LFQ—a 32-dimensional $\boldsymbol z$ will result in $2^{32}=4,294,967,296$ unique tokens—creates significant efficiency problems. One solution is to factorize the token space. Effectively, this means splitting the binary digits into multiple parts, embedding each part separately, and concatenating the resulting embedding vectors. For example, with a 32-dimensional $\boldsymbol z$, if we quantize and embed its first and last 16 dimensions separately, we “only” need to handle $2^{16}*2= 131,072$ unique tokens.
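A rough sketch of LFQ with the two-way factorization described above (the 32-dimensional $\boldsymbol z$ and the 16/16 split mirror the example; the embedding size is an arbitrary assumption):
```python
import torch
import torch.nn as nn

d = 32                                  # dimension of the continuous vector z
split = d // 2                          # factorize the 32 binary digits into two 16-bit halves
powers = 2 ** torch.arange(split)       # [1, 2, 4, ..., 2^15]

def lfq_indices(z):
    """Quantize each dimension of z by its sign, then convert each half to a token index."""
    bits = (z > 0).long()                            # shape: (batch, 32)
    low = (bits[:, :split] * powers).sum(dim=-1)     # index from the first 16 dimensions
    high = (bits[:, split:] * powers).sum(dim=-1)    # index from the last 16 dimensions
    return low, high

# Two 2^16-entry embedding tables instead of one intractable 2^32-entry table.
embed_low = nn.Embedding(2 ** split, 128)
embed_high = nn.Embedding(2 ** split, 128)

z = torch.randn(4, d)
low, high = lfq_indices(z)
token_embedding = torch.cat([embed_low(low), embed_high(high)], dim=-1)  # (4, 256)
```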
Note that this section doesn't extensively explain how to map raw continuous features into the vector $\boldsymbol{z}$, as these techniques are relatively straightforward and depend on the specific feature type—for example, fully-connected layers for numerical features, or CNN/GNN with feature flattening for structured data.
## Quantization over Linear Projection
You might be asking—why can't we simply use linear projections to map the raw continuous features into the embedding space? What are the benefits of quantizing continuous features into discrete tokens?
Although Transformers are regarded as universal sequential models, they were designed for discrete tokens when first introduced in *Vaswani et al., "Attention Is All You Need"*. Empirically, they tend to perform best when dealing with tokens, compared to raw continuous features. This is supported by many research papers reporting that quantizing continuous features improves the performance of Transformers, and by works demonstrating Transformers' subpar performance when applied directly to continuous features.
> Mao, Chengzhi, Lu Jiang, Mostafa Dehghani, Carl Vondrick, Rahul Sukthankar, and Irfan Essa. “Discrete Representations Strengthen Vision Transformer Robustness,” ICLR, 2022.
> Ilbert, Romain, Ambroise Odonnat, et al. “SAMformer: Unlocking the Potential of Transformers in Time Series Forecasting with Sharpness-Aware Minimization and Channel-Wise Attention,” ICML, 2024.
On the other hand, unifying different modalities into tokens is especially beneficial in the context of Transformer-based "foundation models," since it preserves the auto-regressive next-token prediction architecture of LLMs. Combined with special tokens such as "start of sentence" and "end of sentence," the Transformer model can flexibly generate content of mixed modalities with varying lengths.
> For example, by quantizing videos into discrete tokens and combining the token space of videos and language, one can create a unified Transformer model that generates both videos and language in one sequence. The start and end points of video and language sub-sequences are fully determined by the model, based on the specific input prompt. This structure would be difficult to replicate if we used tokenization for language but linear projection for videos.
# Transformer Backbone
After different modalities are mapped into the same embedding space, they can be arranged into a sequence of embedding vectors and input into a Transformer backbone. We don't discuss the variations of Transformer structure and improvement techniques here, as they are numerous, and ultimately function similarly as sequential models.
> Lan et al., “ALBERT”; Ye et al., “Differential Transformer”; Kitaev, Kaiser, and Levskaya, “Reformer”; Su et al., “RoFormer”; Dai et al., “Transformer-XL.”
As we know, the "full" Transformer structure proposed in *Vaswani et al., "Attention Is All You Need"* includes an encoder and a decoder. They perform self-attention within their respective input sequences, and the decoder additionally performs cross-attention between its input sequence and the memory sequence derived from the encoder's output. Some early language models use encoder-only structure (like *Devlin et al., "BERT"*) focused on outputting embedding vectors or encoder-decoder structure (like *Chung et al., "Scaling Instruction-Finetuned Language Models"*) for generating natural language output. Most modern large language models and foundation models use decoder-only structure (like *Brown et al., "Language Models Are Few-Shot Learners"*), focusing on auto-regressive generation of language output.
The encoder-only structure theoretically excels at representation learning, and its produced embedding vectors can be applied to various downstream tasks. Recent developments have gradually moved towards decoder-only structure, centered around the idea of building models that are capable of directly generating the required final output of every downstream task.
> For example, to perform sentiment analysis, BERT will compute an embedding vector for the query sentence, and the embedding vector can be used in a dedicated classifier to predict the sentiment label. GPT, on the other hand, can directly answer the question "what is the sentiment associated with the query sentence?" Comparatively, GPT is more versatile in most cases and can easily perform zero-shot prediction.
Nevertheless, representation learning is still a relevant topic. The general understanding is that decoder-only structure cannot perform conventional representation learning, for example mapping a sentence into a fixed-dimension embedding vector. Yet, there are a few works in the latest ICLR that shed light on the utilization of LLMs as representation learning or embedding models:
> Gao, Leo, Tom Dupre la Tour, Henk Tillman, Gabriel Goh, Rajan Troll, Alec Radford, Ilya Sutskever, Jan Leike, and Jeffrey Wu. “Scaling and Evaluating Sparse Autoencoders,” 2024. [Link](https://openreview.net/forum?id=tcsZt9ZNKD)
> Li, Ziyue, and Tianyi Zhou. “Your Mixture-of-Experts LLM Is Secretly an Embedding Model for Free,” 2024. [Link](https://openreview.net/forum?id=eFGQ97z5Cd)
> Zhang, Jie, Dongrui Liu, Chen Qian, Linfeng Zhang, Yong Liu, Yu Qiao, and Jing Shao. “REEF: Representation Encoding Fingerprints for Large Language Models,” 2024. [Link](https://openreview.net/forum?id=SnDmPkOJ0T)
# Output Layer
For language generation, Transformers typically use classifier output layers, mapping the latent vector of each item in the output sequence back to tokens. As we've established in the "modality embedding" section, the optimal method to embed continuous features is to quantize them into discrete tokens. Correspondingly, an intuitive method to output continuous features is to map these discrete tokens back to the continuous feature space, essentially reversing the vector quantization process.
## Reverse Vector Quantization
One approach to reverse vector quantization is readily available in VQ-VAE, since it is an auto-encoder. Given a token $i$, we can look up its embedding in the codebook as $\boldsymbol C_i$, then apply a decoder network to map $\boldsymbol C_i$ back to the continuous feature vector $\boldsymbol z$. The decoder network can either be pre-trained within the VQ-VAE framework (training the tokenizer, encoder, and decoder with auto-encoding losses) or trained end-to-end along with the whole Transformer. In the NLP and CV communities, the pre-training approach is more popular, since many large-scale pre-trained auto-encoders are available.
![image (4)](multi-modal-transformer.assets/image (4).png)
> The encoder-decoder structure of MAGVIT (*Yu et al., “MAGVIT”*), a visual VQ-VAE model. A 3D-VQ encoder quantizes a video into discrete tokens, and a 3D-VQ decoder maps them back to the pixel space.
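The decoding direction can be sketched as follows (the codebook and the tiny decoder network here are placeholder assumptions; in practice both would come from a pre-trained visual VQ-VAE such as MAGVIT):
```python
import torch
import torch.nn as nn

n, d = 512, 64
codebook = nn.Embedding(n, d)          # shared with the tokenizer/encoder side

# Placeholder decoder; a real visual decoder would be a CNN mapping C_i back to pixels.
decoder = nn.Sequential(
    nn.Linear(d, 256),
    nn.ReLU(),
    nn.Linear(256, 3 * 16 * 16),       # e.g., one 16x16 RGB patch per token
)

def detokenize(token_indices):
    """Map discrete tokens produced by the Transformer back to continuous features."""
    z_q = codebook(token_indices)      # look up C_i for each token
    return decoder(z_q)                # map the embeddings back to the feature space

tokens = torch.randint(0, n, (4, 10))  # a batch of 4 sequences of 10 visual tokens
patches = detokenize(tokens)           # shape: (4, 10, 768)
```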
## Efficiency Enhancement
For continuous feature generation, unlike language generation where the output tokens are the final output, we are essentially representing the final output with a limited-size token space. Thus, for complex continuous features like images and videos, we have to expand the token space, or use more tokens to represent one image or one video frame, to improve generation quality, which can result in efficiency challenges.
There are several workarounds to improve the efficiency of multi-modal outputs. One approach is to generate low-resolution outputs first, then use a separate super-resolution module to improve the quality of the output. This approach is explored in *Kondratyuk et al., "VideoPoet"* and *Tian et al., "Visual Autoregressive Modeling"*. Interestingly, the overall idea is very similar to NVIDIA's DLSS, where the graphics card renders a low-resolution frame (e.g., 1080p) using the conventional rasterization pipeline, then a super-resolution model upscales the frame (e.g., to 4K) using the graphics card's tensor hardware, improving games' overall frame rate.
Another workaround follows the idea of compression. Take video generation as an example. The model generates full features for key frames, and light-weight features for motion vectors that describe subtle differences from those key frames. This is essentially how inter-frame compressed video codecs work, which takes advantage of temporal redundancy between neighboring frames.
![image (5)](multi-modal-transformer.assets/image (5).png)
> Key frames and motion vectors used in *Jin et al., “Video-LaVIT.”*
# Fuse with Diffusion Models
Despite continuous efforts to enable representation and generation of images and videos with a language model structure (auto-regressive), current research indicates that diffusion models (more broadly speaking, score-matching generative models) outperform language models on continuous feature generation. Score-matching generative models have their own separate and substantial community, with strong theoretical foundations and numerous variations emerging each year, such as stochastic differential equations, Bayesian flow, and rectified flow. In short, score-matching generative models are clearly here to stay alongside language models.
An intriguing question arises: why not integrate the structures of language models and diffusion models into one Transformer to reach the best of both worlds? *Zhou et al. in "Transfusion"* explored this idea. The approach is straightforward: build a Transformer that can handle both language and image inputs and outputs. The language component functions as a language model, while the image component serves as a denoiser network for diffusion models. The model is trained by combining the language modeling loss and DDPM loss, enabling it to function either as a language model or a text-to-image denoiser.
![image (6)](multi-modal-transformer.assets/image (6).png)
> A Transformer capable of functioning as a language model and a diffusion denoiser at the same time. Source: *Zhou, Chunting, Lili Yu, et al. “Transfusion: Predict the Next Token and Diffuse Images with One Multi-Modal Model,” ICLR, 2025.*
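As a rough sketch of how the two objectives can share one backbone and one training step (the backbone, heads, and linear noising below are simplified assumptions for illustration, not the actual Transfusion implementation):
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_size, d_model = 1000, 256
backbone = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True), num_layers=2
)
lm_head = nn.Linear(d_model, vocab_size)   # classifier head for text tokens
noise_head = nn.Linear(d_model, d_model)   # denoiser head for image latents

def training_step(text_emb, text_targets, image_latents, t):
    # Diffusion branch: noise the image latents and ask the model to predict the noise.
    noise = torch.randn_like(image_latents)
    noisy_latents = (1 - t) * image_latents + t * noise

    # One sequence containing both modalities, processed by the same backbone.
    hidden = backbone(torch.cat([text_emb, noisy_latents], dim=1))
    text_hidden = hidden[:, : text_emb.size(1)]
    image_hidden = hidden[:, text_emb.size(1):]

    lm_loss = F.cross_entropy(lm_head(text_hidden).flatten(0, 1), text_targets.flatten())
    denoise_loss = F.mse_loss(noise_head(image_hidden), noise)
    return lm_loss + denoise_loss          # combined language modeling + denoising loss

loss = training_step(
    text_emb=torch.randn(2, 8, d_model),
    text_targets=torch.randint(0, vocab_size, (2, 8)),
    image_latents=torch.randn(2, 16, d_model),
    t=torch.rand(2, 1, 1),
)
```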
# Conclusion
In conclusion, the evolution of Transformers into versatile foundation models capable of handling multiple modalities and functionalities represents a significant advancement in AI research. By enabling a single architecture to process diverse data types through techniques like vector quantization and lookup-free quantization, researchers have created models that can seamlessly integrate language, images, and other modalities within the same embedding space.
In our research domain, we encounter even more diverse and domain-specific multi-modal data, such as traffic flows, trajectories, and real-world agent interactions. A unified Transformer for such data presents a promising solution for creating "foundation models" that generalize across diverse tasks and scenarios. However, domain-specific challenges, including data encoding and decoding, computational efficiency, and scalability, must be addressed to realize this potential.

Binary file not shown.

After

Width:  |  Height:  |  Size: 275 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 584 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 918 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 902 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 297 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 748 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 874 KiB

View file

@ -0,0 +1,137 @@
# One Step Diffusion Models
Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.
---
# Background
Diffusion models (DMs), or more broadly speaking, score-matching generative models, have become the de facto framework for building deep generative models. They demonstrate exceptional generation performance, especially on continuous modalities including images, videos, audio, and spatiotemporal data.
Most diffusion models work by coupling a forward diffusion process and a reverse denoising diffusion process. The forward diffusion process gradually adds noise to the ground truth clean data $X_0$, until noisy data $X_T$ that follows a relatively simple distribution is reached. The reverse denoising diffusion process starts from the noisy data $X_T$, and removes the noise component step-by-step until clean generated data $X_0$ is reached. The reverse process is essentially a Monte-Carlo process, meaning it cannot be parallelized for each generation, which can be inefficient for a process with a large number of steps.
![image-20250503125941212](one-step-diffusion-models.assets/image-20250503125941212.png)
> The two processes in a typical diffusion model. *Source: Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”*
## Understanding DMs
There are many ways to understand how Diffusion Models (DMs) work. One of the most common and intuitive approaches is that a DM learns an ordinary differential equation (ODE) that transforms noise into data. Imagine an ODE vector field between the noise $X_T$ and clean data $X_0$. By training on sufficiently large numbers of timesteps $t\in [0,T]$, a DM is able to learn the vector (tangent) towards the cleaner data $X_{t-\Delta t}$, given any specific timestep $t$ and the corresponding noisy data $X_t$. This idea is easy to illustrate in a simplified 1-dimensional data scenario.
![image-20250503132738122](one-step-diffusion-models.assets/image-20250503132738122.png)
> Illustrated ODE flow of a diffusion model on 1-dimensional data. *Source: Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”* It should be noted that as the figure suggests, there are differences between ODEs and DMs in a narrow sense. Flow matching models, a variant of DMs, more closely resemble ODEs.
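Under this view, sampling simply means numerically integrating the learned ODE from noise back to data. A minimal Euler-style integrator might look like the following (the `velocity_model` is a stand-in for whatever network predicts the tangent; the toy model at the end exists only to make the snippet runnable):
```python
import torch

def euler_sample(velocity_model, x_T, num_steps=50):
    """Integrate the learned ODE from noise (t = 1) back to data (t = 0) with Euler steps."""
    x = x_T
    dt = 1.0 / num_steps
    for i in range(num_steps):
        t = torch.full((x.shape[0], 1), 1.0 - i * dt)  # current time for each sample
        v = velocity_model(x, t)                       # predicted tangent dx/dt at (x, t)
        x = x - v * dt                                 # one small step toward cleaner data
    return x

toy_model = lambda x, t: x            # toy "velocity" that shrinks samples toward the origin
samples = euler_sample(toy_model, torch.randn(8, 2))
```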
## DMs Scale Poorly with Few Steps
Vanilla DDPM, which is essentially a discrete-timestep DM, can only perform the reverse process using the same number of steps it is trained on, typically thousands. DDIM introduces a reparameterization scheme that enables skipping steps during the reverse process of DDPM. Continuous-timestep DMs, such as those formulated as Stochastic Differential Equations (SDEs), naturally possess the capability of using fewer steps in the reverse process than in the forward process/training.
> Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”
> Song, Meng, and Ermon, “Denoising Diffusion Implicit Models.”
> Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”
Nevertheless, it is observed that their performance typically suffers catastrophic degradation when reducing the number of reverse process steps to single digits.
![image-20250503135351246](one-step-diffusion-models.assets/image-20250503135351246.png)
> Images generated by conventional DMs with only a few steps of reverse process. *Source: Frans et al., “One Step Diffusion via Shortcut Models.”*
To understand why DMs scale poorly with few reverse process steps, we can return to the ODE vector field perspective of DMs. When the target data distribution is complex, the vector field typically contains numerous intersections. When a given pair $(X_t, t)$ lies at one of these intersections, the vector points in the averaged direction of all candidates. This causes the generated data to approach the mean of the training data when only a few reverse process steps are used. Another explanation is that the learned vector field is highly curved. Using only a few reverse process steps means attempting to approximate these curves with polylines, which is inherently difficult.
![image-20250503141422791](one-step-diffusion-models.assets/image-20250503141422791.png)
> Illustration of why DMs scale poorly with few reverse process steps. *Source: Frans et al., “One Step Diffusion via Shortcut Models.”*
We will introduce two branches of methods that aim to scale DMs down to a few or even a single reverse process step: **distillation-based** methods, which distill a pre-trained DM into a one-step model; and **end-to-end** methods, which train a one-step DM from scratch.
# Distillation
Distillation-based methods are also called **rectified flow** methods. Their idea follows the above insight of "curved ODE vector field": if the curved vectors (flows) are hindering the scaling of reverse process steps, can we try to straighten these vectors so that they are easy to approximate with polylines or even straight lines?
*Liu, Gong, and Liu, "Flow Straight and Fast"* implements this idea, focusing on learning an ODE that follows straight vectors as much as possible. In the context of continuous-time DMs where $T=1$ and $t\in[0,1]$, suppose the clean data $X_0$ and noise $X_1$ each follow a distribution, $X_0\sim \pi_0$ and $X_1\sim \pi_1$. The "straight vectors" can be achieved by solving a nonlinear least squares optimization problem:
$$
\min_{v} \int_{0}^{1} \mathbb{E}\left[\left\|\left(X_{1}-X_{0}\right)-v\left(X_{t}, t\right)\right\|^{2}\right] \mathrm{d} t,
$$
$$
\quad X_{t}=t X_{1}+(1-t) X_{0}
$$
where $v$ is the vector field of the ODE $dZ_t = v(Z_t,t)\,dt$.
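One optimization step of this objective can be sketched as follows (the toy network and the Gaussian placeholder for $\pi_0$ are illustrative assumptions; this is the plain flow-matching loss before any reflow iterations):
```python
import torch
import torch.nn as nn

class VelocityNet(nn.Module):
    """Toy v(X_t, t); a real model would be a U-Net or Transformer."""
    def __init__(self, dim=2):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim + 1, 64), nn.ReLU(), nn.Linear(64, dim))
    def forward(self, x_t, t):
        return self.net(torch.cat([x_t, t], dim=-1))

model = VelocityNet()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

x0 = torch.randn(128, 2)             # clean data from pi_0 (Gaussian placeholder)
x1 = torch.randn(128, 2)             # noise from pi_1
t = torch.rand(128, 1)               # uniform time in [0, 1]

x_t = t * x1 + (1 - t) * x0          # X_t = t X_1 + (1 - t) X_0
loss = ((x1 - x0) - model(x_t, t)).pow(2).mean()   # ||(X_1 - X_0) - v(X_t, t)||^2
loss.backward()
optimizer.step()
```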
Though the objective is straightforward, when the clean data distribution $\pi_0$ is very complicated, the ideal result of completely straight vectors can be hard to achieve. To address this, a "reflow" procedure is introduced. This procedure iteratively trains new rectified flows using data generated by previously obtained flows:
$$
Z^{(k+1)} = \text{RectFlow}\big(Z_0^{(k)}, Z_1^{(k)}\big)
$$
This procedure produces increasingly straight flows that can be simulated with very few steps, ideally one step after several iterations.
![image-20250504142749208](one-step-diffusion-models.assets/image-20250504142749208.png)
> Illustrations of vector fields after different times of reflow processes. *Source: Liu, Gong, and Liu, “Flow Straight and Fast.”*
In practice, distillation-based methods are usually trained in two stages: first train a normal DM, and later distill one-step capabilities into it. This introduces additional computational overhead and complexity.
# End-to-end
Compared to distillation-based methods, end-to-end-based methods train a one-step-capable diffusion model (DM) within a single training run. Various techniques are used to implement such methods. We will focus on two of them: **consistency models** and **shortcut models**.
## Consistency Models
In discrete-timestep diffusion models (DMs), three components in the reverse denoising diffusion process are interchangeable through reparameterization: the noise component $\epsilon_t$ to remove, the less noisy previous step $x_{t-1}$, and the predicted clean sample $x_0$. This interchangeability is enabled by the following equation:
$$
x_t = \sqrt{\bar{\alpha}_t} \, x_0 + \sqrt{1 - \bar{\alpha}_t} \, \epsilon_t
$$
In theory, without altering the fundamental formulation of DMs, the learnable denoiser network can be designed to predict any of these three components. Consistency models (CMs) follow this principle by training the denoiser to specifically predict the clean sample $x_0$. The benefit of this approach is that CMs can naturally scale to perform the reverse process with few steps or even a single step.
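For instance, given the equation above, a denoiser trained to predict the noise $\epsilon_t$ can be turned into an $x_0$-predictor with one line of algebra (the linear beta schedule below is only an illustrative assumption):
```python
import torch

T = 1000
betas = torch.linspace(1e-4, 0.02, T)           # illustrative linear beta schedule
alpha_bar = torch.cumprod(1.0 - betas, dim=0)   # \bar{alpha}_t

def predict_x0(x_t, eps_pred, t):
    """Recover the clean-sample prediction from a noise prediction via reparameterization."""
    a = alpha_bar[t].view(-1, *([1] * (x_t.dim() - 1)))   # broadcast over data dimensions
    return (x_t - torch.sqrt(1.0 - a) * eps_pred) / torch.sqrt(a)

x_t, eps_pred = torch.randn(4, 3, 8, 8), torch.randn(4, 3, 8, 8)
x0_hat = predict_x0(x_t, eps_pred, t=torch.tensor([10, 100, 500, 900]))
```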
![image-20250504161430743](one-step-diffusion-models.assets/image-20250504161430743.png)
> A consistency model that learns to map any point on the ODE trajectory to the clean sample. *Source: Song et al., “Consistency Models.”*
Formally, CMs learn a function $f_\theta(x_t,t)$ that maps noisy data $x_t$ at time $t$ directly to the clean data $x_0$, satisfying:
$$
f_\theta(x_t, t) = f_\theta(x_{t'}, t') \quad \forall t, t'
$$
The model must also obey the differential consistency condition:
$$
\frac{d}{dt} f_\theta(x_t, t) = 0
$$
CMs are trained by minimizing the discrepancy between outputs at adjacent times, with the loss function:
$$
\mathcal{L} = \mathbb{E} \left[ d\left(f_\theta(x_t, t), f_\theta(x_{t'}, t')\right) \right]
$$
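A bare-bones discrete-time consistency training step might look like the following (the linear interpolation, the adjacent-time offset, and the stop-gradient copy of the network as target follow the general recipe, but the details are simplified assumptions):
```python
import copy
import torch
import torch.nn as nn

class ConsistencyNet(nn.Module):
    """Toy f_theta(x_t, t) that directly predicts the clean sample x_0."""
    def __init__(self, dim=2):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim + 1, 64), nn.ReLU(), nn.Linear(64, dim))
    def forward(self, x_t, t):
        return self.net(torch.cat([x_t, t], dim=-1))

model = ConsistencyNet()
target_model = copy.deepcopy(model)     # stop-gradient / EMA target network

x0 = torch.randn(64, 2)                 # clean data
noise = torch.randn_like(x0)
t = torch.rand(64, 1) * 0.9 + 0.1       # time t
t_adj = t - 0.1                         # adjacent, slightly less noisy time t'

# Two points on the same (here: linear) trajectory between data and noise.
x_t = (1 - t) * x0 + t * noise
x_t_adj = (1 - t_adj) * x0 + t_adj * noise

with torch.no_grad():
    target = target_model(x_t_adj, t_adj)          # f at the adjacent time, no gradient

loss = (model(x_t, t) - target).pow(2).mean()      # d(f_theta(x_t, t), f_theta(x_{t'}, t'))
loss.backward()
```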
Similar to continuous-timestep DMs and discrete-timestep DMs, CMs also have continuous-time and discrete-time variants. Discrete-time CMs are easier to train, but are more sensitive to timestep scheduling and suffer from discretization errors. Continuous-time CMs, on the other hand, suffer from instability during training.
For a deeper discussion of the differences between the two variants of CMs, and how to stabilize continuous-time CMs, please refer to *Lu and Song, "Simplifying, Stabilizing and Scaling Continuous-Time Consistency Models."*
## Shortcut Models
Similar to distillation-based methods, the core idea of shortcut models is inspired by the "curved vector field" problem, but shortcut models take a different approach to solving it.
Shortcut models are introduced in *Frans et al., "One Step Diffusion via Shortcut Models."* The paper presents the insight that conventional DMs perform badly when jumping with large step sizes because they are unaware of the step size they are expected to jump forward. Since they are only trained with small step sizes, they learn only the tangents of the curved vector field, not the "correct direction" when a large step size is used.
Based on this insight, on top of $x_t$ and $t$, shortcut models additionally include step size $d$ as part of the condition for the denoiser network. At small step sizes ($d\rightarrow 0$), the model behaves like a standard flow-matching model, learning the expected tangent from noise to data. For larger step sizes, the model learns that one large step should equal two consecutive smaller steps (self-consistency), creating a binary recursive formulation. The model is trained by combining the standard flow matching loss when $d=0$ and the self-consistency loss when $d>0$:
$$
\mathcal{L} = \mathbb{E} [ \underbrace{\| s_\theta(x_t, t, 0) - (x_1 - x_0)\|^2}_{\text{Flow-Matching}} +
$$
$$
\underbrace{\|s_\theta(x_t, t, 2d) - \mathbf{s}_{\text{target}}\|^2}_{\text{Self-Consistency}}],
$$
$$
\quad \mathbf{s}_{\text{target}} = s_\theta(x_t, t, d)/2 + s_\theta(x'_{t+d}, t + d, d)/2 \quad
$$
$$
\text{and} \quad x'_{t+d} = x_t + s_\theta(x_t, t, d)d
$$
![image-20250504180714955](one-step-diffusion-models.assets/image-20250504180714955.png)
> Illustration of the training process of shortcut models. *Source: Frans et al., “One Step Diffusion via Shortcut Models.”*
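One training step combining the two terms can be sketched roughly as follows (the toy network, the linear interpolation, and the fixed step size are illustrative assumptions; the paper samples $d$ and batches the two terms differently):
```python
import torch
import torch.nn as nn

class ShortcutNet(nn.Module):
    """Toy s_theta(x_t, t, d) conditioned on the step size d."""
    def __init__(self, dim=2):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim + 2, 64), nn.ReLU(), nn.Linear(64, dim))
    def forward(self, x_t, t, d):
        return self.net(torch.cat([x_t, t, d], dim=-1))

model = ShortcutNet()

x0 = torch.randn(64, 2)                    # one endpoint of the noise-data path
x1 = torch.randn_like(x0)                  # the other endpoint; the loss mirrors the formulas above
t = torch.rand(64, 1) * (1 - 1.0 / 8)      # leave room for a step of size d
x_t = (1 - t) * x0 + t * x1                # linear interpolation between the endpoints
zeros = torch.zeros_like(t)

# Flow-matching term: at d = 0 the model predicts the plain tangent x1 - x0.
flow_loss = (model(x_t, t, zeros) - (x1 - x0)).pow(2).mean()

# Self-consistency term: one step of size 2d must match two consecutive steps of size d.
d = torch.full_like(t, 1.0 / 8)
with torch.no_grad():
    s1 = model(x_t, t, d)
    x_next = x_t + s1 * d                  # x'_{t+d} = x_t + s_theta(x_t, t, d) * d
    s2 = model(x_next, t + d, d)
    s_target = (s1 + s2) / 2               # average of the two small steps
consistency_loss = (model(x_t, t, 2 * d) - s_target).pow(2).mean()

loss = flow_loss + consistency_loss
loss.backward()
```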
Both consistency models and shortcut models can be seamlessly scaled between one-step and multi-step generation to balance quality and efficiency.

116
dist/blog/template.html vendored Normal file
View file

@ -0,0 +1,116 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Yan Lin's Blog - {{ title }}</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<script>
MathJax = {
tex: {
inlineMath: [['$', '$'], ['\\(', '\\)']],
displayMath: [['$$', '$$'], ['\\[', '\\]']]
},
options: {
skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code'],
processHtmlClass: 'arithmatex'
}
};
</script>
<style>
a {
font-family: 'Lato', sans-serif;
}
img, .figure {
max-width: min(100%, 800px);
height: auto;
display: block;
margin-left: auto;
margin-right: auto;
}
.blog-title {
font-size: calc(1.35rem + 0.9vw);
font-weight: bold;
}
h1 {
font-size: calc(1.35rem + 0.6vw);
margin-top: 2rem;
}
h2 {
font-size: calc(1.1rem + 0.4vw);
margin-top: 1.5rem;
}
h3 {
font-size: calc(0.95rem + 0.1vw);
font-weight: bold;
margin-top: 1rem;
}
</style>
</head>
<body>
<div class="container">
<header class="border-bottom lh-1 py-3 border-secondary">
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
<a class="link-secondary header-icon px-2 h4" href="/"><i class="bi bi-house-fill"></i></a>
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">Yan Lin's Blog</div>
</div>
<div class="col-2 text-end">
<a class="link-secondary header-icon px-2 h4" href="/blog"><i class="bi bi-list-task"></i></a>
</div>
</div>
</header>
</div>
<main class="container">
<article class="section col-xl-10 col-xxl-9 mx-auto">
{{ content }}
</article>
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">Copyright © 2025. Designed and implemented by Yan Lin.</p>
</main>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
</body>
</html>
<script>
document.addEventListener('DOMContentLoaded', function() {
document.querySelectorAll('img').forEach(function(img) {
img.classList.add('figure-img', 'rounded');
});
});
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
var backToTopButton = document.getElementById('back-to-top');
if (window.scrollY > 100) {
backToTopButton.style.display = 'block';
} else {
backToTopButton.style.display = 'none';
}
});
// Scroll to top when the button is clicked
document.getElementById('back-to-top').addEventListener('click', function(e) {
e.preventDefault();
window.scrollTo({
top: 0,
behavior: 'smooth'
});
window.location.href = '#';
return false;
});
</script>

Binary file not shown.

94
dist/fonts/Abril_Fatface/OFL.txt vendored Normal file
View file

@ -0,0 +1,94 @@
Copyright (c) 2011, TypeTogether (www.type-together.com),
with Reserved Font Names "Abril" and "Abril Fatface"
This Font Software is licensed under the SIL Open Font License, Version 1.1.
This license is copied below, and is also available with a FAQ at:
https://openfontlicense.org
-----------------------------------------------------------
SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
-----------------------------------------------------------
PREAMBLE
The goals of the Open Font License (OFL) are to stimulate worldwide
development of collaborative font projects, to support the font creation
efforts of academic and linguistic communities, and to provide a free and
open framework in which fonts may be shared and improved in partnership
with others.
The OFL allows the licensed fonts to be used, studied, modified and
redistributed freely as long as they are not sold by themselves. The
fonts, including any derivative works, can be bundled, embedded,
redistributed and/or sold with any software provided that any reserved
names are not used by derivative works. The fonts and derivatives,
however, cannot be released under any other type of license. The
requirement for fonts to remain under this license does not apply
to any document created using the fonts or their derivatives.
DEFINITIONS
"Font Software" refers to the set of files released by the Copyright
Holder(s) under this license and clearly marked as such. This may
include source files, build scripts and documentation.
"Reserved Font Name" refers to any names specified as such after the
copyright statement(s).
"Original Version" refers to the collection of Font Software components as
distributed by the Copyright Holder(s).
"Modified Version" refers to any derivative made by adding to, deleting,
or substituting -- in part or in whole -- any of the components of the
Original Version, by changing formats or by porting the Font Software to a
new environment.
"Author" refers to any designer, engineer, programmer, technical
writer or other person who contributed to the Font Software.
PERMISSION & CONDITIONS
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Font Software, to use, study, copy, merge, embed, modify,
redistribute, and sell modified and unmodified copies of the Font
Software, subject to the following conditions:
1) Neither the Font Software nor any of its individual components,
in Original or Modified Versions, may be sold by itself.
2) Original or Modified Versions of the Font Software may be bundled,
redistributed and/or sold with any software, provided that each copy
contains the above copyright notice and this license. These can be
included either as stand-alone text files, human-readable headers or
in the appropriate machine-readable metadata fields within text or
binary files as long as those fields can be easily viewed by the user.
3) No Modified Version of the Font Software may use the Reserved Font
Name(s) unless explicit written permission is granted by the corresponding
Copyright Holder. This restriction only applies to the primary font name as
presented to the users.
4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
Software shall not be used to promote, endorse or advertise any
Modified Version, except to acknowledge the contribution(s) of the
Copyright Holder(s) and the Author(s) or with their explicit written
permission.
5) The Font Software, modified or unmodified, in part or in whole,
must be distributed entirely under this license, and must not be
distributed under any other license. The requirement for fonts to
remain under this license does not apply to any document created
using the Font Software.
TERMINATION
This license becomes null and void if any of the above conditions are
not met.
DISCLAIMER
THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
OTHER DEALINGS IN THE FONT SOFTWARE.

Binary file not shown.

97
dist/fonts/Domine/OFL.txt vendored Normal file
View file

@ -0,0 +1,97 @@
Copyright 2020 The Domine Project Authors (https://github.com/googlefonts/domine)
Copyright (c) 2012, Pablo Impallari (www.impallari.com|impallari@gmail.com),
Copyright (c) 2012, Pablo Impallari (www.impallari.com|impallari@gmail.com),
Copyright (c) 2012, Rodrigo Fuenzalida (www.rfuenzalida.com|hello@rfuenzalida.com),
Copyright (c) 2012, Brenda Gallo (gbrenda1987@gmail.com), with Reserved Font Name Domine.
This Font Software is licensed under the SIL Open Font License, Version 1.1.
This license is copied below, and is also available with a FAQ at:
https://openfontlicense.org
-----------------------------------------------------------
SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
-----------------------------------------------------------
PREAMBLE
The goals of the Open Font License (OFL) are to stimulate worldwide
development of collaborative font projects, to support the font creation
efforts of academic and linguistic communities, and to provide a free and
open framework in which fonts may be shared and improved in partnership
with others.
The OFL allows the licensed fonts to be used, studied, modified and
redistributed freely as long as they are not sold by themselves. The
fonts, including any derivative works, can be bundled, embedded,
redistributed and/or sold with any software provided that any reserved
names are not used by derivative works. The fonts and derivatives,
however, cannot be released under any other type of license. The
requirement for fonts to remain under this license does not apply
to any document created using the fonts or their derivatives.
DEFINITIONS
"Font Software" refers to the set of files released by the Copyright
Holder(s) under this license and clearly marked as such. This may
include source files, build scripts and documentation.
"Reserved Font Name" refers to any names specified as such after the
copyright statement(s).
"Original Version" refers to the collection of Font Software components as
distributed by the Copyright Holder(s).
"Modified Version" refers to any derivative made by adding to, deleting,
or substituting -- in part or in whole -- any of the components of the
Original Version, by changing formats or by porting the Font Software to a
new environment.
"Author" refers to any designer, engineer, programmer, technical
writer or other person who contributed to the Font Software.
PERMISSION & CONDITIONS
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Font Software, to use, study, copy, merge, embed, modify,
redistribute, and sell modified and unmodified copies of the Font
Software, subject to the following conditions:
1) Neither the Font Software nor any of its individual components,
in Original or Modified Versions, may be sold by itself.
2) Original or Modified Versions of the Font Software may be bundled,
redistributed and/or sold with any software, provided that each copy
contains the above copyright notice and this license. These can be
included either as stand-alone text files, human-readable headers or
in the appropriate machine-readable metadata fields within text or
binary files as long as those fields can be easily viewed by the user.
3) No Modified Version of the Font Software may use the Reserved Font
Name(s) unless explicit written permission is granted by the corresponding
Copyright Holder. This restriction only applies to the primary font name as
presented to the users.
4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
Software shall not be used to promote, endorse or advertise any
Modified Version, except to acknowledge the contribution(s) of the
Copyright Holder(s) and the Author(s) or with their explicit written
permission.
5) The Font Software, modified or unmodified, in part or in whole,
must be distributed entirely under this license, and must not be
distributed under any other license. The requirement for fonts to
remain under this license does not apply to any document created
using the Font Software.
TERMINATION
This license becomes null and void if any of the above conditions are
not met.
DISCLAIMER
THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
OTHER DEALINGS IN THE FONT SOFTWARE.

66
dist/fonts/Domine/README.txt vendored Normal file
View file

@ -0,0 +1,66 @@
Domine Variable Font
====================
This download contains Domine as both a variable font and static fonts.
Domine is a variable font with this axis:
wght
This means all the styles are contained in a single file:
Domine/Domine-VariableFont_wght.ttf
If your app fully supports variable fonts, you can now pick intermediate styles
that arent available as static fonts. Not all apps support variable fonts, and
in those cases you can use the static font files for Domine:
Domine/static/Domine-Regular.ttf
Domine/static/Domine-Medium.ttf
Domine/static/Domine-SemiBold.ttf
Domine/static/Domine-Bold.ttf
Get started
-----------
1. Install the font files you want to use
2. Use your app's font picker to view the font family and all the
available styles
Learn more about variable fonts
-------------------------------
https://developers.google.com/web/fundamentals/design-and-ux/typography/variable-fonts
https://variablefonts.typenetwork.com
https://medium.com/variable-fonts
In desktop apps
https://theblog.adobe.com/can-variable-fonts-illustrator-cc
https://helpx.adobe.com/nz/photoshop/using/fonts.html#variable_fonts
Online
https://developers.google.com/fonts/docs/getting_started
https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Fonts/Variable_Fonts_Guide
https://developer.microsoft.com/en-us/microsoft-edge/testdrive/demos/variable-fonts
Installing fonts
MacOS: https://support.apple.com/en-us/HT201749
Linux: https://www.google.com/search?q=how+to+install+a+font+on+gnu%2Blinux
Windows: https://support.microsoft.com/en-us/help/314960/how-to-install-or-remove-a-font-in-windows
Android Apps
https://developers.google.com/fonts/docs/android
https://developer.android.com/guide/topics/ui/look-and-feel/downloadable-fonts
License
-------
Please read the full license text (OFL.txt) to understand the permissions,
restrictions and requirements for usage, redistribution, and modification.
You can use them in your products & projects print or digital,
commercial or otherwise.
This isn't legal advice, please consider consulting a lawyer and see the full
license for all details.

BIN
dist/fonts/Domine/static/Domine-Bold.ttf vendored Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
dist/fonts/Lato/Lato-Black.ttf vendored Normal file

Binary file not shown.

BIN
dist/fonts/Lato/Lato-BlackItalic.ttf vendored Normal file

Binary file not shown.

BIN
dist/fonts/Lato/Lato-Bold.ttf vendored Normal file

Binary file not shown.

BIN
dist/fonts/Lato/Lato-BoldItalic.ttf vendored Normal file

Binary file not shown.

BIN
dist/fonts/Lato/Lato-Italic.ttf vendored Normal file

Binary file not shown.

BIN
dist/fonts/Lato/Lato-Light.ttf vendored Normal file

Binary file not shown.

BIN
dist/fonts/Lato/Lato-LightItalic.ttf vendored Normal file

Binary file not shown.

BIN
dist/fonts/Lato/Lato-Regular.ttf vendored Normal file

Binary file not shown.

BIN
dist/fonts/Lato/Lato-Thin.ttf vendored Normal file

Binary file not shown.

BIN
dist/fonts/Lato/Lato-ThinItalic.ttf vendored Normal file

Binary file not shown.

93
dist/fonts/Lato/OFL.txt vendored Normal file
View file

@ -0,0 +1,93 @@
Copyright (c) 2010-2014 by tyPoland Lukasz Dziedzic (team@latofonts.com) with Reserved Font Name "Lato"
This Font Software is licensed under the SIL Open Font License, Version 1.1.
This license is copied below, and is also available with a FAQ at:
https://openfontlicense.org
-----------------------------------------------------------
SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
-----------------------------------------------------------
PREAMBLE
The goals of the Open Font License (OFL) are to stimulate worldwide
development of collaborative font projects, to support the font creation
efforts of academic and linguistic communities, and to provide a free and
open framework in which fonts may be shared and improved in partnership
with others.
The OFL allows the licensed fonts to be used, studied, modified and
redistributed freely as long as they are not sold by themselves. The
fonts, including any derivative works, can be bundled, embedded,
redistributed and/or sold with any software provided that any reserved
names are not used by derivative works. The fonts and derivatives,
however, cannot be released under any other type of license. The
requirement for fonts to remain under this license does not apply
to any document created using the fonts or their derivatives.
DEFINITIONS
"Font Software" refers to the set of files released by the Copyright
Holder(s) under this license and clearly marked as such. This may
include source files, build scripts and documentation.
"Reserved Font Name" refers to any names specified as such after the
copyright statement(s).
"Original Version" refers to the collection of Font Software components as
distributed by the Copyright Holder(s).
"Modified Version" refers to any derivative made by adding to, deleting,
or substituting -- in part or in whole -- any of the components of the
Original Version, by changing formats or by porting the Font Software to a
new environment.
"Author" refers to any designer, engineer, programmer, technical
writer or other person who contributed to the Font Software.
PERMISSION & CONDITIONS
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Font Software, to use, study, copy, merge, embed, modify,
redistribute, and sell modified and unmodified copies of the Font
Software, subject to the following conditions:
1) Neither the Font Software nor any of its individual components,
in Original or Modified Versions, may be sold by itself.
2) Original or Modified Versions of the Font Software may be bundled,
redistributed and/or sold with any software, provided that each copy
contains the above copyright notice and this license. These can be
included either as stand-alone text files, human-readable headers or
in the appropriate machine-readable metadata fields within text or
binary files as long as those fields can be easily viewed by the user.
3) No Modified Version of the Font Software may use the Reserved Font
Name(s) unless explicit written permission is granted by the corresponding
Copyright Holder. This restriction only applies to the primary font name as
presented to the users.
4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
Software shall not be used to promote, endorse or advertise any
Modified Version, except to acknowledge the contribution(s) of the
Copyright Holder(s) and the Author(s) or with their explicit written
permission.
5) The Font Software, modified or unmodified, in part or in whole,
must be distributed entirely under this license, and must not be
distributed under any other license. The requirement for fonts to
remain under this license does not apply to any document created
using the Font Software.
TERMINATION
This license becomes null and void if any of the above conditions are
not met.
DISCLAIMER
THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
OTHER DEALINGS IN THE FONT SOFTWARE.

BIN
dist/fonts/georgia/georgia.ttf vendored Normal file

Binary file not shown.

BIN
dist/fonts/georgia/georgiab.ttf vendored Normal file

Binary file not shown.

BIN
dist/fonts/georgia/georgiai.ttf vendored Normal file

Binary file not shown.

BIN
dist/fonts/georgia/georgiaz.ttf vendored Normal file

Binary file not shown.

245
dist/index.css vendored Normal file
View file

@ -0,0 +1,245 @@
/* Font declarations */
@font-face {
font-family: 'Lato';
src: url('/fonts/Lato/Lato-Regular.ttf') format('truetype');
font-weight: normal;
font-style: normal;
font-display: swap;
}
@font-face {
font-family: 'Lato';
src: url('/fonts/Lato/Lato-Bold.ttf') format('truetype');
font-weight: bold;
font-style: normal;
font-display: swap;
}
@font-face {
font-family: 'Domine';
src: url('/fonts/Domine/static/Domine-Regular.ttf') format('truetype');
font-weight: normal;
font-style: normal;
font-display: swap;
}
@font-face {
font-family: 'Domine';
src: url('/fonts/Domine/static/Domine-Bold.ttf') format('truetype');
font-weight: 700;
font-style: normal;
font-display: swap;
}
@font-face {
font-family: 'Abril Fatface';
src: url('/fonts/Abril_Fatface/AbrilFatface-Regular.ttf') format('truetype');
font-weight: normal;
font-style: normal;
font-display: swap;
}
@font-face {
font-family: 'Georgia';
src: url('/fonts/georgia/georgia.ttf') format('truetype');
font-weight: normal;
font-style: normal;
font-display: swap;
}
@font-face {
font-family: 'Georgia';
src: url('/fonts/georgia/georgiab.ttf') format('truetype');
font-weight: bold;
font-style: normal;
font-display: swap;
}
@font-face {
font-family: 'Georgia';
src: url('/fonts/georgia/georgiai.ttf') format('truetype');
font-weight: normal;
font-style: italic;
font-display: swap;
}
@font-face {
font-family: 'Georgia';
src: url('/fonts/georgia/georgiaz.ttf') format('truetype');
font-weight: bold;
font-style: italic;
font-display: swap;
}
:root {
--main-font-family: Georgia, "Times New Roman", serif;
/* Light mode variables */
--background-color: #fff;
--background-secondary: #f8f9fa;
--text-color: #212529;
--text-secondary: #6c757d;
--border-color: #dee2e6;
--shadow-color: rgba(0, 0, 0, 0.15);
--primary-text: #58151c;
--secondary-text: #052c65;
/* assumed value: --link-color is used by .link and .blog-link below but was not defined anywhere */
--link-color: #212529;
--link-hover-color: #555;
}
@media (prefers-color-scheme: dark) {
:root {
/* Dark mode variables */
--background-color: #212529;
--background-secondary: #343a40;
--text-color: #f8f9fa;
--text-secondary: #adb5bd;
--border-color: #495057;
--shadow-color: rgba(0, 0, 0, 0.5);
--primary-text: #ffddb3;
--secondary-text: #c6e2ff;
/* assumed value: dark-mode counterpart of --link-color */
--link-color: #f8f9fa;
--link-hover-color: #ddd;
}
}
html, body {
height: 100%;
margin: 0;
}
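/* Sticky footer: the body is a full-height column flexbox; main grows to fill the space and the footer stays at the bottom */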
body {
font-family: var(--main-font-family);
background-color: var(--background-color);
color: var(--text-color);
display: flex;
flex-direction: column;
min-height: 100vh;
}
/* Make main content grow to push footer down */
main.container {
flex: 1 0 auto;
}
/* Dark mode overrides for Bootstrap components */
@media (prefers-color-scheme: dark) {
.bg-body-secondary {
background-color: var(--background-secondary) !important;
}
.text-body-emphasis {
color: var(--text-color) !important;
}
.border, .border-bottom {
border-color: var(--border-color) !important;
}
.link-secondary {
color: var(--text-secondary) !important;
}
.shadow-sm, .shadow {
box-shadow: 0 .125rem .25rem var(--shadow-color) !important;
}
.btn-light {
background-color: var(--background-secondary);
color: var(--text-color);
border-color: var(--border-color);
}
.list-group-flush .list-group-item {
background-color: transparent;
color: var(--text-color);
border-color: var(--border-color);
}
.text-muted {
color: var(--text-secondary) !important;
}
.figure-caption {
color: var(--text-secondary) !important;
}
}
.link {
font-family: 'Lato', sans-serif;
color: var(--link-color);
}
.link:hover {
color: var(--link-hover-color);
}
.blog-link {
color: var(--link-color);
}
.blog-link:hover {
color: var(--link-hover-color);
}
.section {
margin-top: 2rem;
margin-bottom: 2rem;
}
.paper-container {
padding: .8rem;
}
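/* Fluid type sizes: a rem base plus a small vw term so headings and labels scale gently with viewport width */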
.paper-title {
font-size: calc(1.0rem + 0.1vw);
font-weight: 500;
}
.paper-link {
font-size: calc(0.7rem + 0.1vw);
}
.venue-name {
font-size: calc(0.85rem + 0.1vw);
font-weight: 500;
}
.author-name, .project-desc, .tldr {
font-size: calc(0.7rem + 0.1vw);
}
.primary-text {
color: var(--primary-text);
}
.secondary-text {
color: var(--secondary-text);
}
.blog-title {
font-family: 'Domine', serif;
font-weight: 700;
}
blockquote {
border-left: 4px solid var(--border-color);
margin: 1.5em 0;
padding: 0.5em 1em;
background-color: var(--background-secondary);
}
blockquote p {
margin: 0;
}
@media (prefers-color-scheme: dark) {
blockquote {
border-left-color: var(--border-color);
}
}
footer {
margin-top: 0rem;
padding: 1rem 0;
width: 100%;
flex-shrink: 0;
}

746
dist/index.html vendored Normal file
View file

@ -0,0 +1,746 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Yan Lin's Homepage</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
</head>
<body>
<main class="container">
<header class="border-bottom lh-1 pt-3 pb-0 border-secondary">
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
<a class="link-secondary header-icon px-2 h4" href="mailto:s@yanlincs.com"><i class="bi bi-envelope-fill"></i></a>
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">Yan Lin's Homepage</div>
</div>
<div class="col-2 text-end">
<a class="link-secondary header-icon px-2 h4" href="https://lab.yanlincs.com"><i class="bi bi-stack"></i></a>
</div>
</div>
<nav class="navbar navbar-expand">
<ul class="navbar-nav d-flex justify-content-evenly mx-auto gap-3 gap-md-5">
<li class="nav-item">
<a class="link nav-link px-0" href="/#publications">Publications</a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#projects">Projects</a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#presentations">Presentations</a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#services">Services</a>
</li>
</ul>
</nav>
</header>
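<!-- Biography card: inline mouseover/mouseout handlers swap shadow-sm and shadow for a hover lift effect -->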
<div class="row g-0 border rounded text-body-emphasis bg-body-secondary flex-md-row my-4 position-relative shadow-sm transition-shadow" style="transition: box-shadow 0.2s ease-in-out;" onmouseover="this.classList.remove('shadow-sm'); this.classList.add('shadow')" onmouseout="this.classList.remove('shadow'); this.classList.add('shadow-sm')">
<div class="col p-4 d-flex flex-column position-static">
<h2 class="fst-italic mb-3">Biography - Yan Lin</h2>
<p class="card-text mb-auto" style="font-size: 1.1rem;">
I am currently a postdoctoral researcher in the Department of Computer Science at Aalborg University.
I received my PhD and Bachelor's degrees from Beijing Jiaotong University, China.
My research interests include <i>spatiotemporal data mining</i>, <i>representation learning</i>, and <i>AI for science</i>.
</p>
</div>
<div class="col-5 col-xl-4 col-xxl-3 d-none d-lg-block d-flex align-items-center">
<img src="/profile.webp" alt="Yan Lin" class="rounded w-100" style="object-fit: contain;">
</div>
</div>
<article class="section" id="publications">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-book"></i> Publications</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/publications/">View All <i class="bi bi-arrow-right-circle"></i></a>
</div>
<div>
<div id="primary-publications" class="list-group list-group-flush">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IEEE TKDE<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2402.07232" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/UVTM" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">UVTM: Universal Vehicle Trajectory Modeling with ST Feature Domain Generation</h5>
<p class="card-text mb-auto author-name">Yan Lin, Jilin Hu, Shengnan Guo, Bin Yang, Christian S. Jensen, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IJCAI<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2405.12459" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Zeru19/PLM4Traj" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">TrajCogn: Leveraging LLMs for Cognizing Movement Patterns and Travel Purposes from Trajectories</h5>
<p class="card-text mb-auto author-name">Zeyu Zhou*, <strong>Yan Lin*</strong>, Haomin Wen, Shengnan Guo, Jilin Hu, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IEEE TKDE<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ieeexplore.ieee.org/document/10818577" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2407.12550" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/UniTE" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">UniTE: A Survey and Unified Pipeline for Pre-training Spatiotemporal Trajectory Embeddings</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Zeyu Zhou, Yicheng Liu, Haochen Lv, Haomin Wen, Tianyi Li, Yushuai Li, Christian S. Jensen, Shengnan Guo, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
WWW<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://openreview.net/forum?id=KmMSQS6tFn" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/decisionintelligence/Path-LLM" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Path-LLM: A Multi-Modal Path Representation Learning by Aligning and Fusing with Large Language Models</h5>
<p class="card-text mb-auto author-name">Yongfu Wei*, <strong>Yan Lin*</strong>, Hongfan Gao, Ronghui Xu, Sean Bin Yang, Jilin Hu</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
AAAI<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2408.12809" target="_blank" rel="noopener noreferrer">Preprint</a>
</div>
</div>
<h5 class="mb-1 paper-title">DutyTTE: Deciphering Uncertainty in Origin-Destination Travel Time Estimation</h5>
<p class="card-text mb-auto author-name">Xiaowei Mao*, <strong>Yan Lin*</strong>, Shengnan Guo, Yubin Chen, Xingyu Xian, Haomin Wen, Qisen Xu, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
NeurIPS<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://openreview.net/forum?id=0feJEykDRx" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://neurips.cc/virtual/2024/poster/96914" target="_blank" rel="noopener noreferrer">Poster</a>
</div>
</div>
<h5 class="mb-1 paper-title">Mobility-LLM: Learning Visiting Intentions and Travel Preference from Human Mobility Data with Large Language Models</h5>
<p class="card-text mb-auto author-name">Letian Gong*, <strong>Yan Lin*</strong>, Xinyue Zhang, Yiwen Lu, Xuedi Han, Yichen Liu, Shengnan Guo, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
SIGMOD<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://dl.acm.org/doi/10.1145/3617337" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2307.03048" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/DOT" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Origin-Destination Travel Time Oracle for Map-based Services</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Huaiyu Wan, Jilin Hu, Shengnan Guo, Bin Yang, Christian S. Jensen, Youfang Lin</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IEEE TKDE<span class='text-muted'> | </span>2023
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ieeexplore.ieee.org/abstract/document/10375102" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2207.14539" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/MMTEC" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Pre-training General Trajectory Embeddings with Maximum Multi-view Entropy Coding</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Huaiyu Wan, Shengnan Guo, Jilin Hu, Christian S. Jensen, Youfang Lin</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IEEE TKDE<span class='text-muted'> | </span>2022
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ieeexplore.ieee.org/abstract/document/9351627" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/TALE" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Pre-training Time-aware location embeddings from spatial-temporal trajectories</h5>
<p class="card-text mb-auto author-name">Huaiyu Wan, <strong>Yan Lin</strong>, Shengnan Guo, Youfang Lin</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
AAAI<span class='text-muted'> | </span>2021
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ojs.aaai.org/index.php/AAAI/article/view/16548" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/CTLE" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Huaiyu Wan, Shengnan Guo, Youfang Lin</p>
</div>
</div>
<hr class="my-2">
<div id="secondary-publications" class="list-group list-group-flush">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
KDD<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2412.10859" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/decisionintelligence/DUET" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">DUET: Dual Clustering Enhanced Multivariate Time Series Forecasting</h5>
<p class="card-text mb-auto author-name">Xiangfei Qiu, Xingjian Wu, <strong>Yan Lin</strong>, Chenjuan Guo, Jilin Hu, Bin Yang</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
IEEE TKDE<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://www.computer.org/csdl/journal/tk/5555/01/10679607/20b3hlbjBOo" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2402.07369" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/wtl52656/Diff-RNTraj" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Diff-RNTraj: A Structure-aware Diffusion Model for Road Network-constrained Trajectory Generation</h5>
<p class="card-text mb-auto author-name">Tonglong Wei, Youfang Lin, Shengnan Guo, <strong>Yan Lin</strong>, Yiheng Huang, Chenyang Xiang, Yuqing Bai, Menglu Ya, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
IEEE TKDE<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ieeexplore.ieee.org/document/10836764" target="_blank" rel="noopener noreferrer">Paper</a>
</div>
</div>
<h5 class="mb-1 paper-title">STCDM: Spatio-Temporal Contrastive Diffusion Model for Check-In Sequence Generation</h5>
<p class="card-text mb-auto author-name">Letian Gong, Shengnan Guo, <strong>Yan Lin</strong>, Yichen Liu, Erwen Zheng, Yiwei Shuang, Youfang Lin, Jilin Hu, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
IEEE TKDE<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://www.computer.org/csdl/journal/tk/5555/01/10517676/1WCj0j0FljW" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2404.19141" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/wtl52656/MM-STGED" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Micro-Macro Spatial-Temporal Graph-based Encoder-Decoder for Map-Constrained Trajectory Recovery</h5>
<p class="card-text mb-auto author-name">Tonglong Wei, Youfang Lin, <strong>Yan Lin</strong>, Shengnan Guo, Lan Zhang, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
KBS<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://www.sciencedirect.com/science/article/pii/S0950705123010730" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/wtl52656/IAGCN" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Inductive and Adaptive Graph Convolution Networks Equipped with Constraint Task for Spatial-Temporal Traffic Data Kriging</h5>
<p class="card-text mb-auto author-name">Tonglong Wei, Youfang Lin, Shengnan Guo, <strong>Yan Lin</strong>, Yiji Zhao, Xiyuan Jin, Zhihao Wu, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
IEEE TKDE<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2407.15899" target="_blank" rel="noopener noreferrer">Preprint</a>
</div>
</div>
<h5 class="mb-1 paper-title">Spatial-Temporal Cross-View Contrastive Pre-Training for Check-in Sequence Representation Learning</h5>
<p class="card-text mb-auto author-name">Letian Gong, Huaiyu Wan, Shengnan Guo, Li Xiucheng, <strong>Yan Lin</strong>, Erwen Zheng, Tianyi Wang, Zeyu Zhou, Youfang Lin</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
AAAI<span class='text-muted'> | </span>2023
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ojs.aaai.org/index.php/AAAI/article/view/25546" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/LetianGong/CACSR" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Contrastive Pre-training with Adversarial Perturbations for Check-In Sequence Representation Learning</h5>
<p class="card-text mb-auto author-name">Letian Gong, Youfang Lin, Shengnan Guo, <strong>Yan Lin</strong>, Tianyi Wang, Erwen Zheng, Zeyu Zhou, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
ESWA<span class='text-muted'> | </span>2023
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://www.sciencedirect.com/science/article/pii/S0957417423012241" target="_blank" rel="noopener noreferrer">Paper</a>
</div>
</div>
<h5 class="mb-1 paper-title">Adversarial Self-Attentive Time-Variant Neural Networks for Multi-Step Time Series Forecasting</h5>
<p class="card-text mb-auto author-name">Changxia Gao, Ning Zhang, Youru Li, <strong>Yan Lin</strong>, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
APIN<span class='text-muted'> | </span>2023
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://link.springer.com/article/10.1007/s10489-023-05057-7" target="_blank" rel="noopener noreferrer">Paper</a>
</div>
</div>
<h5 class="mb-1 paper-title">Multi-scale Adaptive Attention-based Time-Variant Neural Networks for Multi-step Time Series Forecasting</h5>
<p class="card-text mb-auto author-name">Changxia Gao, Ning Zhang, Youru Li, <strong>Yan Lin</strong>, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
NeurIPS<span class='text-muted'> | </span>2023
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://openreview.net/forum?id=y08bkEtNBK" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Water2sea/WITRAN" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">WITRAN: Water-wave Information Transmission and Recurrent Acceleration Network for Long-range Time Series Forecasting</h5>
<p class="card-text mb-auto author-name">Yuxin Jia, Youfang Lin, Xinyan Hao, <strong>Yan Lin</strong>, Shengnan Guo, Huaiyu Wan</p>
</div>
</div>
</div>
<div class="text-start mt-1">
<small class="text-muted" style="font-size: 0.8rem;">* Equal Contribution</small>
</div>
</article>
<article class="section" id="projects">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-code-slash"></i> Projects</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/projects/">View All <i class="bi bi-arrow-right-circle"></i></a>
</div>
<div>
<div id="primary-projects" class="list-group list-group-flush">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Fundamental Research Funds for the Central Universities of China
</p>
<div class="d-flex gap-2">
</div>
</div>
<h5 class="mb-1 paper-title">Research on <i>Prediction of User Travel Destination and Travel Time Based on Trajectory Representation Learning</i></h5>
<p class="card-text mb-auto project-desc">Applies representation learning to trajectory data to transform original features into high-level information, improving the performance of downstream tasks such as travel time and destination prediction.</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Personal Interest Project
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://www.overleafcopilot.com/" target="_blank" rel="noopener noreferrer">Home</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://chromewebstore.google.com/detail/overleaf-copilot/eoadabdpninlhkkbhngoddfjianhlghb" target="_blank" rel="noopener noreferrer">Install</a>
</div>
</div>
<h5 class="mb-1 paper-title">Development of <i>OverleafCopilot - Empowering Academic Writing in Overleaf with Large Language Models</i></h5>
<p class="card-text mb-auto project-desc">This project aims to develop a Browser extension to seamlessly integrate Large Language Models (such as ChatGPT) into the popular online academic writing platform, Overleaf.</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Personal Interest Project
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://www.promptgenius.site/" target="_blank" rel="noopener noreferrer">Website</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/wenhaomin/ChatGPT-PromptGenius" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Development of <i>PromptGenius - All-purpose prompts for LLMs</i></h5>
<p class="card-text mb-auto project-desc">This project focuses on developing a website that offers a wide range of prompt categories, enhancing the versatility of LLMs for various tasks and improving their output quality.</p>
</div>
</div>
<hr class="my-2">
<div id="secondary-projects" class="list-group list-group-flush">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
Villum Foundation
</p>
<div class="d-flex gap-2">
</div>
</div>
<h5 class="mb-1 paper-title">Research on <i>Inverse Design of Materials Using Diffusion Probabilistic Models</i></h5>
<p class="card-text mb-auto project-desc">This project focuses on developing diffusion probabilistic models to first understand the relationship between chemistry/structure and material properties, then enable the inverse design of new materials with specific properties. This project currently supports my postdoctoral research position.</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
National Natural Science Foundation of China
</p>
<div class="d-flex gap-2">
</div>
</div>
<h5 class="mb-1 paper-title">Research on <i>Pre-training Representation Learning Methods of Spatial-temporal Trajectory Data for Traffic Prediction</i></h5>
<p class="card-text mb-auto project-desc">This project aims to propose pre-training representation learning methods for spatial-temporal trajectory data, modeling multiple features to improve traffic prediction tasks. It demonstrates how trajectory representation learning can enhance traffic data mining.</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
National Natural Science Foundation of China
</p>
<div class="d-flex gap-2">
</div>
</div>
<h5 class="mb-1 paper-title">Research on <i>Spatial-temporal Trajectory Generation and Representation Learning Methods for Sparsity Problems</i></h5>
<p class="card-text mb-auto project-desc">This project explores how to generate high-quality spatial-temporal trajectory data and corresponding representations to address sparsity-related issues, thereby supporting a variety of downstream tasks.</p>
</div>
</div>
</div>
</article>
<article class="section" id="presentations">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-easel"></i> Presentations</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/presentations/">View All <i class="bi bi-arrow-right-circle"></i></a>
</div>
<div class="list-group list-group-flush" id="presentation-list">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Guest lecture<span class='text-muted'> | </span>Aalborg University
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="/assets/Self-supervised Learning of Trajectory Data.pdf" target="_blank" rel="noopener noreferrer">Slides</a>
</div>
</div>
<h5 class="mb-1 paper-title">Self-supervised Learning of Trajectory Data</h5>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Workshop presentation<span class='text-muted'> | </span>KDD 2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="/assets/KDD_2024_Workshop_PLM4Traj.pdf" target="_blank" rel="noopener noreferrer">Slides</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2405.12459" target="_blank" rel="noopener noreferrer">Paper</a>
</div>
</div>
<h5 class="mb-1 paper-title">PLM4Traj: Leveraging Pre-trained Language Models for Cognizing Movement Patterns and Travel Purposes from Trajectories</h5>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Paper Oral<span class='text-muted'> | </span>SIGMOD 2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="/assets/SIGMOD-Oral-PPT.pdf" target="_blank" rel="noopener noreferrer">Slides</a>
</div>
</div>
<h5 class="mb-1 paper-title">Origin-Destination Travel Time Oracle for Map-based Services</h5>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Tutorial<span class='text-muted'> | </span>SpatialDI 2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="/assets/Talk on SpatialDI 2024.pdf" target="_blank" rel="noopener noreferrer">Slides</a>
</div>
</div>
<h5 class="mb-1 paper-title">Self-supervised Learning of Spatial-temporal Trajectories</h5>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Paper Oral<span class='text-muted'> | </span>AAAI 2021
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="/assets/AAAI21 Oral PPT.pdf" target="_blank" rel="noopener noreferrer">Slides</a>
</div>
</div>
<h5 class="mb-1 paper-title">Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction</h5>
</div>
</div>
</article>
<article id="services" class="rounded text-body-emphasis bg-body-secondary flex-md-row my-4 position-relative p-4 transition-shadow" style="transition: box-shadow 0.2s ease-in-out;" onmouseover="this.classList.add('shadow-sm')" onmouseout="this.classList.remove('shadow-sm')">
<h2 class="mb-3"><i class="bi bi-person-lines-fill"></i> Services</h2>
<div id="service-list">
<ul class="list ps-3">
<li>IEEE, ACM member</li>
<li>Secretary of IEEE (Denmark Section) Computer Society</li>
<li>Reviewer for journals including TIST, TII, and TVT</li>
<li>Member of program committees of ICLR, KDD, AAAI, CVPR, ICCV, IJCAI, and WWW</li>
</ul>
</div>
</article>
<article class="section" id="blog">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-newspaper"></i> Blog</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/blog/">View All <i class="bi bi-arrow-right-circle"></i></a>
</div>
<div class="list-group list-group-flush" id="blog-list">
<div class="list-group-item px-0">
<a class="mb-1 paper-title blog-link" href="/blog/html/one-step-diffusion-models.html">One Step Diffusion Models</a> | <span class="paper-title text-muted">May 2025</span>
<p class="card-text mb-auto tldr">Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.</p>
</div>
<div class="list-group-item px-0">
<a class="mb-1 paper-title blog-link" href="/blog/html/multi-modal-transformer.html">Multi-modal and Multi-function Transformers</a> | <span class="paper-title text-muted">April 2025</span>
<p class="card-text mb-auto tldr">Multi-modal and multi-function Transformers enables a single architecture to process diverse data types such as language, images, and videos simultaneously. These models employ techniques like vector quantization and lookup-free quantization to map different modalities into a unified embedding space, allowing the Transformer to handle them within the same sequence. Beyond processing multiple data types, these architectures can also combine different functionalities-such as auto-regressive language generation and diffusion-based image creation-within a single model.</p>
</div>
</div>
</article>
</main>
<footer>
<div class="container">
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">
Copyright © 2025. Designed and implemented by Yan Lin.
</p>
</div>
</footer>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
<script>
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
var backToTopButton = document.getElementById('back-to-top');
if (window.scrollY > 100) {
backToTopButton.style.display = 'block';
} else {
backToTopButton.style.display = 'none';
}
});
// Scroll to top when the button is clicked
document.getElementById('back-to-top').addEventListener('click', function(e) {
e.preventDefault();
window.scrollTo({
top: 0,
behavior: 'smooth'
});
// preventDefault() above already stops the default jump; assigning '#' to location.href here would cancel the smooth scroll
});
</script>
</body>
</html>

BIN
dist/logo.webp vendored Normal file

Binary file not shown.

158
dist/presentations/index.html vendored Normal file
View file

@ -0,0 +1,158 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Yan Lin's Presentations</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
</head>
<body>
<main class="container">
<header class="border-bottom lh-1 py-3 border-secondary">
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
<a class="link-secondary header-icon px-2 h4" href="/"><i class="bi bi-house-fill"></i></a>
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">Yan Lin's Presentations</div>
</div>
<div class="col-2 text-end">
</div>
</div>
</header>
<article class="section mt-4">
<div class="list-group list-group-flush">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Guest lecture<span class='text-muted'> | </span>Aalborg University
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="/assets/Self-supervised Learning of Trajectory Data.pdf" target="_blank" rel="noopener noreferrer">Slides</a>
</div>
</div>
<h5 class="mb-1 paper-title">Self-supervised Learning of Trajectory Data</h5>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Workshop presentation<span class='text-muted'> | </span>KDD 2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="/assets/KDD_2024_Workshop_PLM4Traj.pdf" target="_blank" rel="noopener noreferrer">Slides</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2405.12459" target="_blank" rel="noopener noreferrer">Paper</a>
</div>
</div>
<h5 class="mb-1 paper-title">PLM4Traj: Leveraging Pre-trained Language Models for Cognizing Movement Patterns and Travel Purposes from Trajectories</h5>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Paper Oral<span class='text-muted'> | </span>SIGMOD 2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="/assets/SIGMOD-Oral-PPT.pdf" target="_blank" rel="noopener noreferrer">Slides</a>
</div>
</div>
<h5 class="mb-1 paper-title">Origin-Destination Travel Time Oracle for Map-based Services</h5>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Tutorial<span class='text-muted'> | </span>SpatialDI 2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="/assets/Talk on SpatialDI 2024.pdf" target="_blank" rel="noopener noreferrer">Slides</a>
</div>
</div>
<h5 class="mb-1 paper-title">Self-supervised Learning of Spatial-temporal Trajectories</h5>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Paper Oral<span class='text-muted'> | </span>AAAI 2021
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="/assets/AAAI21 Oral PPT.pdf" target="_blank" rel="noopener noreferrer">Slides</a>
</div>
</div>
<h5 class="mb-1 paper-title">Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction</h5>
</div>
</div>
</article>
</main>
<footer>
<div class="container">
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">
Copyright © 2025. Designed and implemented by Yan Lin.
</p>
</div>
</footer>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
<script>
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
var backToTopButton = document.getElementById('back-to-top');
if (window.scrollY > 100) {
backToTopButton.style.display = 'block';
} else {
backToTopButton.style.display = 'none';
}
});
// Scroll to top when the button is clicked
document.getElementById('back-to-top').addEventListener('click', function(e) {
e.preventDefault();
window.scrollTo({
top: 0,
behavior: 'smooth'
});
// preventDefault() above already stops the default jump; assigning '#' to location.href here would cancel the smooth scroll
});
</script>
</body>
</html>

BIN
dist/profile.webp vendored Normal file

Binary file not shown.

206
dist/projects/index.html vendored Normal file
View file

@ -0,0 +1,206 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Yan Lin's Projects</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
</head>
<body>
<main class="container">
<header class="border-bottom lh-1 py-3 border-secondary">
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
<a class="link-secondary header-icon px-2 h4" href="/"><i class="bi bi-house-fill"></i></a>
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">Yan Lin's Projects</div>
</div>
<div class="col-2 text-end">
</div>
</div>
<!-- <nav class="navbar navbar-expand">
<ul class="navbar-nav d-flex justify-content-evenly mx-auto gap-3 gap-md-5">
<li class="nav-item">
<a class="link nav-link px-0" href="/#publications">Publications</a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#projects">Projects</a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#presentations">Presentations</a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#services">Services</a>
</li>
</ul>
</nav> -->
</header>
<article class="section mt-4">
<h2 class="section-title mb-3"><i class="bi bi-star-fill"></i> Primary Projects</h2>
<div class="list-group list-group-flush mb-4">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Fundamental Research Funds for the Central Universities of China
</p>
<div class="d-flex gap-2">
</div>
</div>
<h5 class="mb-1 paper-title">Research on <i>Prediction of User Travel Destination and Travel Time Based on Trajectory Representation Learning</i></h5>
<p class="card-text mb-auto project-desc">Applies representation learning to trajectory data to transform original features into high-level information, improving the performance of downstream tasks such as travel time and destination prediction.</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Personal Interest Project
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://www.overleafcopilot.com/" target="_blank" rel="noopener noreferrer">Home</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://chromewebstore.google.com/detail/overleaf-copilot/eoadabdpninlhkkbhngoddfjianhlghb" target="_blank" rel="noopener noreferrer">Install</a>
</div>
</div>
<h5 class="mb-1 paper-title">Development of <i>OverleafCopilot - Empowering Academic Writing in Overleaf with Large Language Models</i></h5>
<p class="card-text mb-auto project-desc">This project aims to develop a Browser extension to seamlessly integrate Large Language Models (such as ChatGPT) into the popular online academic writing platform, Overleaf.</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
Personal Interest Project
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://www.promptgenius.site/" target="_blank" rel="noopener noreferrer">Website</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/wenhaomin/ChatGPT-PromptGenius" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Development of <i>PromptGenius - All-purpose prompts for LLMs</i></h5>
<p class="card-text mb-auto project-desc">This project focuses on developing a website that offers a wide range of prompt categories, enhancing the versatility of LLMs for various tasks and improving their output quality.</p>
</div>
</div>
<h2 class="section-title mb-3 mt-4"><i class="bi bi-star"></i> Secondary Projects</h2>
<div class="list-group list-group-flush">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
Villum Foundation
</p>
<div class="d-flex gap-2">
</div>
</div>
<h5 class="mb-1 paper-title">Research on <i>Inverse Design of Materials Using Diffusion Probabilistic Models</i></h5>
<p class="card-text mb-auto project-desc">This project focuses on developing diffusion probabilistic models to first understand the relationship between chemistry/structure and material properties, then enable the inverse design of new materials with specific properties. This project currently supports my postdoctoral research position.</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
National Natural Science Foundation of China
</p>
<div class="d-flex gap-2">
</div>
</div>
<h5 class="mb-1 paper-title">Research on <i>Pre-training Representation Learning Methods of Spatial-temporal Trajectory Data for Traffic Prediction</i></h5>
<p class="card-text mb-auto project-desc">This project aims to propose pre-training representation learning methods for spatial-temporal trajectory data, modeling multiple features to improve traffic prediction tasks. It demonstrates how trajectory representation learning can enhance traffic data mining.</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
National Natural Science Foundation of China
</p>
<div class="d-flex gap-2">
</div>
</div>
<h5 class="mb-1 paper-title">Research on <i>Spatial-temporal Trajectory Generation and Representation Learning Methods for Sparsity Problems</i></h5>
<p class="card-text mb-auto project-desc">This project explores how to generate high-quality spatial-temporal trajectory data and corresponding representations to address sparsity-related issues, thereby supporting a variety of downstream tasks.</p>
</div>
</div>
</article>
</main>
<footer>
<div class="container">
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">
Copyright © 2025. Designed and implemented by Yan Lin.
</p>
</div>
</footer>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
<script>
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
var backToTopButton = document.getElementById('back-to-top');
if (window.scrollY > 100) {
backToTopButton.style.display = 'block';
} else {
backToTopButton.style.display = 'none';
}
});
// Scroll to top when the button is clicked
document.getElementById('back-to-top').addEventListener('click', function(e) {
e.preventDefault();
window.scrollTo({
top: 0,
behavior: 'smooth'
});
// preventDefault() above already stops the default jump; assigning '#' to location.href here would cancel the smooth scroll
});
</script>
</body>
</html>

492
dist/publications/index.html vendored Normal file
View file

@ -0,0 +1,492 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Yan Lin's Publications</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
</head>
<body>
<main class="container">
<header class="border-bottom lh-1 py-3 border-secondary">
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
<a class="link-secondary header-icon px-2 h4" href="/"><i class="bi bi-house-fill"></i></a>
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">Yan Lin's Publications</div>
</div>
<div class="col-2 text-end">
</div>
</div>
<!-- <nav class="navbar navbar-expand">
<ul class="navbar-nav d-flex justify-content-evenly mx-auto gap-3 gap-md-5">
<li class="nav-item">
<a class="link nav-link px-0" href="/#publications">Publications</a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#projects">Projects</a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#presentations">Presentations</a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#services">Services</a>
</li>
</ul>
</nav> -->
</header>
<article class="section mt-4">
<h2 class="section-title mb-3"><i class="bi bi-star-fill"></i> Primary Publications</h2>
<div class="list-group list-group-flush mb-4">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IEEE TKDE<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2402.07232" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/UVTM" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">UVTM: Universal Vehicle Trajectory Modeling with ST Feature Domain Generation</h5>
<p class="card-text mb-auto author-name">Yan Lin, Jilin Hu, Shengnan Guo, Bin Yang, Christian S. Jensen, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IJCAI<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2405.12459" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Zeru19/PLM4Traj" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">TrajCogn: Leveraging LLMs for Cognizing Movement Patterns and Travel Purposes from Trajectories</h5>
<p class="card-text mb-auto author-name">Zeyu Zhou*, <strong>Yan Lin*</strong>, Haomin Wen, Shengnan Guo, Jilin Hu, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IEEE TKDE<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ieeexplore.ieee.org/document/10818577" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2407.12550" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/UniTE" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">UniTE: A Survey and Unified Pipeline for Pre-training Spatiotemporal Trajectory Embeddings</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Zeyu Zhou, Yicheng Liu, Haochen Lv, Haomin Wen, Tianyi Li, Yushuai Li, Christian S. Jensen, Shengnan Guo, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
WWW<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://openreview.net/forum?id=KmMSQS6tFn" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/decisionintelligence/Path-LLM" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Path-LLM: A Multi-Modal Path Representation Learning by Aligning and Fusing with Large Language Models</h5>
<p class="card-text mb-auto author-name">Yongfu Wei*, <strong>Yan Lin*</strong>, Hongfan Gao, Ronghui Xu, Sean Bin Yang, Jilin Hu</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
AAAI<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2408.12809" target="_blank" rel="noopener noreferrer">Preprint</a>
</div>
</div>
<h5 class="mb-1 paper-title">DutyTTE: Deciphering Uncertainty in Origin-Destination Travel Time Estimation</h5>
<p class="card-text mb-auto author-name">Xiaowei Mao*, <strong>Yan Lin*</strong>, Shengnan Guo, Yubin Chen, Xingyu Xian, Haomin Wen, Qisen Xu, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
NeurIPS<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://openreview.net/forum?id=0feJEykDRx" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://neurips.cc/virtual/2024/poster/96914" target="_blank" rel="noopener noreferrer">Poster</a>
</div>
</div>
<h5 class="mb-1 paper-title">Mobility-LLM: Learning Visiting Intentions and Travel Preference from Human Mobility Data with Large Language Models</h5>
<p class="card-text mb-auto author-name">Letian Gong*, <strong>Yan Lin*</strong>, Xinyue Zhang, Yiwen Lu, Xuedi Han, Yichen Liu, Shengnan Guo, Youfang Lin, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
SIGMOD<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://dl.acm.org/doi/10.1145/3617337" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2307.03048" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/DOT" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Origin-Destination Travel Time Oracle for Map-based Services</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Huaiyu Wan, Jilin Hu, Shengnan Guo, Bin Yang, Christian S. Jensen, Youfang Lin</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IEEE TKDE<span class='text-muted'> | </span>2023
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ieeexplore.ieee.org/abstract/document/10375102" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2207.14539" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/MMTEC" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Pre-training General Trajectory Embeddings with Maximum Multi-view Entropy Coding</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Huaiyu Wan, Shengnan Guo, Jilin Hu, Christian S. Jensen, Youfang Lin</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
IEEE TKDE<span class='text-muted'> | </span>2022
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ieeexplore.ieee.org/abstract/document/9351627" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/TALE" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Pre-training Time-aware location embeddings from spatial-temporal trajectories</h5>
<p class="card-text mb-auto author-name">Huaiyu Wan, <strong>Yan Lin</strong>, Shengnan Guo, Youfang Lin</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
AAAI<span class='text-muted'> | </span>2021
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ojs.aaai.org/index.php/AAAI/article/view/16548" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Logan-Lin/CTLE" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction</h5>
<p class="card-text mb-auto author-name"><strong>Yan Lin</strong>, Huaiyu Wan, Shengnan Guo, Youfang Lin</p>
</div>
</div>
<h2 class="section-title mb-3 mt-4"><i class="bi bi-star"></i> Secondary Publications</h2>
<div class="list-group list-group-flush">
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
KDD<span class='text-muted'> | </span>2025
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2412.10859" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/decisionintelligence/DUET" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">DUET: Dual Clustering Enhanced Multivariate Time Series Forecasting</h5>
<p class="card-text mb-auto author-name">Xiangfei Qiu, Xingjian Wu, <strong>Yan Lin</strong>, Chenjuan Guo, Jilin Hu, Bin Yang</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
IEEE TKDE<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://www.computer.org/csdl/journal/tk/5555/01/10679607/20b3hlbjBOo" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2402.07369" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/wtl52656/Diff-RNTraj" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Diff-RNTraj: A Structure-aware Diffusion Model for Road Network-constrained Trajectory Generation</h5>
<p class="card-text mb-auto author-name">Tonglong Wei, Youfang Lin, Shengnan Guo, <strong>Yan Lin</strong>, Yiheng Huang, Chenyang Xiang, Yuqing Bai, Menglu Ya, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
IEEE TKDE<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ieeexplore.ieee.org/document/10836764" target="_blank" rel="noopener noreferrer">Paper</a>
</div>
</div>
<h5 class="mb-1 paper-title">STCDM: Spatio-Temporal Contrastive Diffusion Model for Check-In Sequence Generation</h5>
<p class="card-text mb-auto author-name">Letian Gong, Shengnan Guo, <strong>Yan Lin</strong>, Yichen Liu, Erwen Zheng, Yiwei Shuang, Youfang Lin, Jilin Hu, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
IEEE TKDE<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://www.computer.org/csdl/journal/tk/5555/01/10517676/1WCj0j0FljW" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2404.19141" target="_blank" rel="noopener noreferrer">Preprint</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/wtl52656/MM-STGED" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Micro-Macro Spatial-Temporal Graph-based Encoder-Decoder for Map-Constrained Trajectory Recovery</h5>
<p class="card-text mb-auto author-name">Tonglong Wei, Youfang Lin, <strong>Yan Lin</strong>, Shengnan Guo, Lan Zhang, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
KBS<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://www.sciencedirect.com/science/article/pii/S0950705123010730" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/wtl52656/IAGCN" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Inductive and Adaptive Graph Convolution Networks Equipped with Constraint Task for Spatial-Temporal Traffic Data Kriging</h5>
<p class="card-text mb-auto author-name">Tonglong Wei, Youfang Lin, Shengnan Guo, <strong>Yan Lin</strong>, Yiji Zhao, Xiyuan Jin, Zhihao Wu, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
IEEE TKDE<span class='text-muted'> | </span>2024
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://arxiv.org/abs/2407.15899" target="_blank" rel="noopener noreferrer">Preprint</a>
</div>
</div>
<h5 class="mb-1 paper-title">Spatial-Temporal Cross-View Contrastive Pre-Training for Check-in Sequence Representation Learning</h5>
<p class="card-text mb-auto author-name">Letian Gong, Huaiyu Wan, Shengnan Guo, Li Xiucheng, <strong>Yan Lin</strong>, Erwen Zheng, Tianyi Wang, Zeyu Zhou, Youfang Lin</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
AAAI<span class='text-muted'> | </span>2023
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://ojs.aaai.org/index.php/AAAI/article/view/25546" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/LetianGong/CACSR" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">Contrastive Pre-training with Adversarial Perturbations for Check-In Sequence Representation Learning</h5>
<p class="card-text mb-auto author-name">Letian Gong, Youfang Lin, Shengnan Guo, <strong>Yan Lin</strong>, Tianyi Wang, Erwen Zheng, Zeyu Zhou, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
ESWA<span class='text-muted'> | </span>2023
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://www.sciencedirect.com/science/article/pii/S0957417423012241" target="_blank" rel="noopener noreferrer">Paper</a>
</div>
</div>
<h5 class="mb-1 paper-title">Adversarial Self-Attentive Time-Variant Neural Networks for Multi-Step Time Series Forecasting</h5>
<p class="card-text mb-auto author-name">Changxia Gao, Ning Zhang, Youru Li, <strong>Yan Lin</strong>, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
APIN<span class='text-muted'> | </span>2023
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://link.springer.com/article/10.1007/s10489-023-05057-7" target="_blank" rel="noopener noreferrer">Paper</a>
</div>
</div>
<h5 class="mb-1 paper-title">Multi-scale Adaptive Attention-based Time-Variant Neural Networks for Multi-step Time Series Forecasting</h5>
<p class="card-text mb-auto author-name">Changxia Gao, Ning Zhang, Youru Li, <strong>Yan Lin</strong>, Huaiyu Wan</p>
</div>
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name secondary-text">
NeurIPS<span class='text-muted'> | </span>2023
</p>
<div class="d-flex gap-2">
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://openreview.net/forum?id=y08bkEtNBK" target="_blank" rel="noopener noreferrer">Paper</a>
<a class="link icon-link icon-link-hover paper-link link-secondary" href="https://github.com/Water2sea/WITRAN" target="_blank" rel="noopener noreferrer">Code</a>
</div>
</div>
<h5 class="mb-1 paper-title">WITRAN: Water-wave Information Transmission and Recurrent Acceleration Network for Long-range Time Series Forecasting</h5>
<p class="card-text mb-auto author-name">Yuxin Jia, Youfang Lin, Xinyan Hao, <strong>Yan Lin</strong>, Shengnan Guo, Huaiyu Wan</p>
</div>
</div>
<div class="text-start mt-3">
<small class="text-muted" style="font-size: 0.8rem;">* Equal Contribution</small>
</div>
</article>
</main>
<footer>
<div class="container">
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">
Copyright © 2025. Designed and implemented by Yan Lin.
</p>
</div>
</footer>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
<script>
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
var backToTopButton = document.getElementById('back-to-top');
if (window.scrollY > 100) {
backToTopButton.style.display = 'block';
} else {
backToTopButton.style.display = 'none';
}
});
// Scroll to top when the button is clicked
document.getElementById('back-to-top').addEventListener('click', function(e) {
e.preventDefault();
window.scrollTo({
top: 0,
behavior: 'smooth'
});
        // Clear any #section hash without triggering an instant jump that would cancel the smooth scroll
        history.replaceState(null, '', window.location.pathname + window.location.search);
return false;
});
</script>
</body>
</html>

15
docker-compose.yml Normal file
View file

@ -0,0 +1,15 @@
services:
homepage:
image: nginx:alpine
container_name: homepage
ports:
- "9000:80"
volumes:
- ./dist:/usr/share/nginx/html
restart: unless-stopped
networks:
- proxy-network
networks:
proxy-network:
external: true

33
generate.py Normal file
View file

@ -0,0 +1,33 @@
import os
import yaml
from jinja2 import Environment, FileSystemLoader
if __name__ == '__main__':
with open('data.yaml', 'r') as file:
profile_data = yaml.safe_load(file)
env = Environment(loader=FileSystemLoader('templates'))
os.makedirs('dist', exist_ok=True)
os.makedirs('dist/publications', exist_ok=True)
os.makedirs('dist/projects', exist_ok=True)
os.makedirs('dist/presentations', exist_ok=True)
os.makedirs('dist/blog', exist_ok=True)
os.makedirs('dist/blog/html', exist_ok=True)
def render_template(template_name, output_path, **kwargs):
template = env.get_template(template_name)
html = template.render(**kwargs)
with open(output_path, 'w') as file:
file.write(html)
print(f'Generated {output_path}')
render_template('index.html', 'dist/index.html', data=profile_data, is_home_page=True)
render_template('publications.html', 'dist/publications/index.html', data=profile_data, is_home_page=False)
render_template('projects.html', 'dist/projects/index.html', data=profile_data, is_home_page=False)
render_template('presentations.html', 'dist/presentations/index.html', data=profile_data, is_home_page=False)
render_template('blog.html', 'dist/blog/index.html', data=profile_data, is_home_page=False)
print('Static site generation complete!')

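generate.py loads data.yaml once and renders each top-level Jinja2 template into dist/. The sketch below is not part of this commit; it shows how a single publication entry can be pushed through the publication partial in isolation, which can be handy when checking a new data.yaml record. It assumes it is run from the repository root, and the entry index and type value are illustrative.

import yaml
from jinja2 import Environment, FileSystemLoader

# Load the same data file generate.py uses and point the loader at templates/.
with open('data.yaml', 'r') as f:
    data = yaml.safe_load(f)
env = Environment(loader=FileSystemLoader('templates'))

# Render one entry the way index.html's include loop does (type='primary').
partial = env.get_template('partials/publication.html')
print(partial.render(pub=data['primaryPublications'][0], type='primary'))
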
168
parser/md.py Normal file
View file

@ -0,0 +1,168 @@
import markdown
import re
import os
import glob
from typing import List, Tuple
def markdown_to_html_paragraphs(markdown_text: str) -> Tuple[List[str], str]:
    """
    Convert markdown text into a list of HTML paragraphs.
    Supports mathematical equations using LaTeX syntax.
    Args:
        markdown_text (str): The markdown text to convert
    Returns:
        Tuple[List[str], str]: The HTML paragraphs (each wrapped in <p> tags,
        with the blog-title wrapper first) and the plain title text
    Raises:
        ValueError: If the markdown does not start with an h1 title
    """
# Prepend "md/" to image paths if they don't already start with md/
markdown_text = re.sub(r'!\[(.*?)\]\((?!md/)([^/].*?\.assets/.*?)\)', r'![\1](/blog/md/\2)', markdown_text)
# Check if the first line starts with a # for h1 title
lines = markdown_text.split('\n')
has_h1_title = False
bold_title = None
if lines and lines[0].strip().startswith('#'):
has_h1_title = True
title_line = lines[0].strip().lstrip('#').strip()
bold_title = f'<p class="blog-title">{title_line}</p>'
# Remove the title from the markdown to avoid duplicate processing
markdown_text = '\n'.join(lines[1:])
else:
raise ValueError("No title found in the markdown file")
# Configure markdown with math extensions
extensions = [
'markdown.extensions.extra', # For blockquotes and other features
'markdown.extensions.fenced_code', # For code blocks
'markdown.extensions.codehilite', # For syntax highlighting
'markdown.extensions.attr_list', # For attributes
'markdown.extensions.md_in_html', # For markdown inside HTML
'mdx_math', # For math support
]
try:
        # Prefer pymdownx.arithmatex (PyMdown Extensions), whose generic output is compatible with MathJax 3
import pymdownx.arithmatex
extensions.remove('mdx_math')
extensions.append('pymdownx.arithmatex')
extension_configs = {
'pymdownx.arithmatex': {
'generic': True # Uses \(...\) for inline and \[...\] for display math
}
}
except ImportError:
        # Fall back to mdx_math (python-markdown-math)
extension_configs = {
'mdx_math': {
'enable_dollar_delimiter': True, # Enable $...$ for inline math
}
}
# Convert markdown to HTML with math support
html = markdown.markdown(
markdown_text,
extensions=extensions,
extension_configs=extension_configs
)
html = re.sub(r'<p>\s*(<img[^>]+>)\s*</p>', r'\1', html, flags=re.IGNORECASE)
# Convert image followed by blockquote to figure with caption
html = re.sub(
r'<img([^>]+)>\s*<blockquote>\s*<p>(.*?)</p>\s*</blockquote>',
r'<figure class="figure">\n <img\1 class="figure-img img-fluid rounded">\n <figcaption class="figure-caption">\2</figcaption>\n</figure>',
html,
flags=re.DOTALL
)
# Add "link" class and target="_blank" to all <a> tags
html = re.sub(r'<a(.*?)>', r'<a\1 class="link" target="_blank">', html)
html = re.sub(r'<a(.*?)class="(.*?)"(.*?)class="(.*?)"(.*?)>', r'<a\1class="\2 \4"\3\5>', html)
html = re.sub(r'<a(.*?)target="(.*?)"(.*?)target="(.*?)"(.*?)>', r'<a\1target="\2"\3\5>', html)
# Split the HTML into paragraphs
paragraphs = html.split('\n\n')
# Clean up and ensure each paragraph is properly wrapped
cleaned_paragraphs = []
# Add the bold title as the first element if it exists
if has_h1_title and bold_title:
cleaned_paragraphs.append(bold_title)
for p in paragraphs:
p = p.strip()
if p:
            # Wrap bare text in <p> tags; fragments that already start with an
            # HTML tag (including <p>) are left untouched
            if not p.startswith('<'):
                p = f'<p>{p}</p>'
cleaned_paragraphs.append(p)
return cleaned_paragraphs, title_line
def insert_markdown_into_template(template_path: str, markdown_text: str) -> str:
"""
Insert parsed markdown content into the template HTML file.
Args:
template_path (str): Path to the template HTML file
markdown_text (str): The markdown text to convert and insert
Returns:
str: Complete HTML with markdown content inserted
"""
# Parse markdown into HTML paragraphs
html_paragraphs, title_line = markdown_to_html_paragraphs(markdown_text)
# Read the template
with open(template_path, 'r') as f:
template = f.read()
# Join paragraphs into a single string
content_html = '\n'.join(html_paragraphs)
# Insert the content into the template
complete_html = template.replace('{{ content }}', content_html)
# Replace {{ title }} placeholders with the extracted title
complete_html = complete_html.replace('{{ title }}', title_line)
return complete_html
def process_all_markdown_files():
"""
    Process all markdown files in dist/blog/md/ and generate HTML files in dist/blog/html/.
"""
# Get all markdown files in blog/md/
md_files = glob.glob("dist/blog/md/*.md")
template_path = "dist/blog/template.html"
for md_file in md_files:
# Extract base filename without extension
base_name = os.path.basename(md_file)[:-3] # Remove .md extension
html_file = f"dist/blog/html/{base_name}.html"
print(f"Processing {md_file} -> {html_file}")
try:
# Read the markdown content
with open(md_file, "r") as f:
markdown_text = f.read()
# Generate HTML content
complete_html = insert_markdown_into_template(template_path, markdown_text)
# Write HTML output
with open(html_file, "w") as f:
f.write(complete_html)
except Exception as e:
print(f"Error processing {md_file}: {str(e)}")
if __name__ == "__main__":
process_all_markdown_files()

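parser/md.py turns each post in dist/blog/md/ into a standalone page via dist/blog/template.html. The sketch below is not part of this commit; it exercises markdown_to_html_paragraphs directly, assuming it is run from the repository root (so parser/md.py is importable) with the packages from requirements.txt installed. The sample markdown text is made up for illustration.

from parser.md import markdown_to_html_paragraphs

# The first line must be an h1 title, otherwise the function raises ValueError.
sample = (
    "# A Sample Post\n"
    "\n"
    "Some *introductory* text with inline math $e^{i\\pi} + 1 = 0$.\n"
    "\n"
    "> A closing remark.\n"
)

paragraphs, title = markdown_to_html_paragraphs(sample)
print(title)            # -> A Sample Post
for p in paragraphs:    # first element is the <p class="blog-title"> wrapper
    print(p)
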
4
preproduction.sh Normal file
View file

@ -0,0 +1,4 @@
python parser/md.py
python generate.py
cd dist
python -m http.server 8000

4
requirements.txt Normal file
View file

@ -0,0 +1,4 @@
markdown>=3.4.0
python-markdown-math>=0.8
pyyaml>=6.0.2
jinja2>=3.1.6

4
sync.sh Normal file
View file

@ -0,0 +1,4 @@
python parser/md.py
python generate.py
rsync -avP --delete ./dist/ hetzner:~/homepage/dist
rsync -avP ./docker-compose.yml hetzner:~/homepage/

85
templates/base.html Normal file
View file

@ -0,0 +1,85 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{% block title %}Yan Lin{% endblock %}</title>
<link rel="icon" href="/logo.webp" type="image/x-icon">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
<link rel="stylesheet" href="/index.css">
{% block extra_head %}{% endblock %}
</head>
<body>
<main class="container">
{% if is_home_page %}
<header class="border-bottom lh-1 pt-3 pb-0 border-secondary">
{% else %}
<header class="border-bottom lh-1 py-3 border-secondary">
{% endif %}
<div class="row flex-nowrap justify-content-between align-items-center">
<div class="col-2">
{% block header_left %}
{% if is_home_page %}
<a class="link-secondary header-icon px-2 h4" href="mailto:s@yanlincs.com"><i class="bi bi-envelope-fill"></i></a>
{% else %}
<a class="link-secondary header-icon px-2 h4" href="/"><i class="bi bi-house-fill"></i></a>
{% endif %}
{% endblock %}
</div>
<div class="col-8 text-center">
<div class="page-header-logo h2 m-0 fw-bold" style="font-family: 'Abril Fatface', serif;">{% block header_title %}Yan Lin's Homepage{% endblock %}</div>
</div>
<div class="col-2 text-end">
{% block header_right %}
{% if is_home_page %}
<a class="link-secondary header-icon px-2 h4" href="https://lab.yanlincs.com"><i class="bi bi-stack"></i></a>
{% endif %}
{% endblock %}
</div>
</div>
{% block navigation %}{% endblock %}
</header>
{% block content %}{% endblock %}
</main>
<footer>
<div class="container">
<p class="text-center text-secondary" style="font-size: 0.8rem; font-family: 'Lato', sans-serif;">
Copyright © 2025. Designed and implemented by Yan Lin.
</p>
</div>
</footer>
<button id="back-to-top" class="btn btn-light rounded-circle" style="position: fixed; bottom: 20px; right: 20px; display: none; z-index: 1000; width: 40px; height: 40px; padding: 0;"><i class="bi bi-chevron-up"></i></button>
{% block extra_js %}
<script>
// Show or hide the back-to-top button
window.addEventListener('scroll', function() {
var backToTopButton = document.getElementById('back-to-top');
if (window.scrollY > 100) {
backToTopButton.style.display = 'block';
} else {
backToTopButton.style.display = 'none';
}
});
// Scroll to top when the button is clicked
document.getElementById('back-to-top').addEventListener('click', function(e) {
e.preventDefault();
window.scrollTo({
top: 0,
behavior: 'smooth'
});
        // Clear any #section hash without triggering an instant jump that would cancel the smooth scroll
        history.replaceState(null, '', window.location.pathname + window.location.search);
return false;
});
</script>
{% endblock %}
</body>
</html>

18
templates/blog.html Normal file
View file

@ -0,0 +1,18 @@
{% extends 'base.html' %}
{% block title %}Yan Lin's Blog{% endblock %}
{% block header_title %}Yan Lin's Blog{% endblock %}
{% block navigation %}
{% endblock %}
{% block content %}
<article class="section mt-4">
<div class="list-group list-group-flush">
{% for blog in data.blogs %}
{% include 'partials/blog.html' %}
{% endfor %}
</div>
</article>
{% endblock %}

113
templates/index.html Normal file
View file

@ -0,0 +1,113 @@
{% extends 'base.html' %}
{% block title %}Yan Lin's Homepage{% endblock %}
{% block navigation %}
{% include 'partials/navigation.html' %}
{% endblock %}
{% block content %}
<div class="row g-0 border rounded text-body-emphasis bg-body-secondary flex-md-row my-4 position-relative shadow-sm transition-shadow" style="transition: box-shadow 0.2s ease-in-out;" onmouseover="this.classList.remove('shadow-sm'); this.classList.add('shadow')" onmouseout="this.classList.remove('shadow'); this.classList.add('shadow-sm')">
<div class="col p-4 d-flex flex-column position-static">
<h2 class="fst-italic mb-3">Biography - Yan Lin</h2>
<p class="card-text mb-auto" style="font-size: 1.1rem;">
I am currently a postdoctoral researcher in the Department of Computer Science at Aalborg University.
I received my PhD and Bachelor's degrees from Beijing Jiaotong University, China.
My research interests include <i>spatiotemporal data mining</i>, <i>representation learning</i>, and <i>AI for science</i>.
</p>
</div>
<div class="col-5 col-xl-4 col-xxl-3 d-none d-lg-block d-flex align-items-center">
<img src="/profile.webp" alt="Yan Lin" class="rounded w-100" style="object-fit: contain;">
</div>
</div>
<article class="section" id="publications">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-book"></i> Publications</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/publications/">View All <i class="bi bi-arrow-right-circle"></i></a>
</div>
<div>
<div id="primary-publications" class="list-group list-group-flush">
{% for pub in data.primaryPublications[:10] %}
{% with type='primary' %}
{% include 'partials/publication.html' %}
{% endwith %}
{% endfor %}
</div>
<hr class="my-2">
<div id="secondary-publications" class="list-group list-group-flush">
{% for pub in data.secondaryPublications[:10] %}
{% with type='secondary' %}
{% include 'partials/publication.html' %}
{% endwith %}
{% endfor %}
</div>
</div>
<div class="text-start mt-1">
<small class="text-muted" style="font-size: 0.8rem;">* Equal Contribution</small>
</div>
</article>
<article class="section" id="projects">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-code-slash"></i> Projects</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/projects/">View All <i class="bi bi-arrow-right-circle"></i></a>
</div>
<div>
<div id="primary-projects" class="list-group list-group-flush">
{% for project in data.primaryProjects[:3] %}
{% with type='primary' %}
{% include 'partials/project.html' %}
{% endwith %}
{% endfor %}
</div>
<hr class="my-2">
<div id="secondary-projects" class="list-group list-group-flush">
{% for project in data.secondaryProjects[:3] %}
{% with type='secondary' %}
{% include 'partials/project.html' %}
{% endwith %}
{% endfor %}
</div>
</div>
</article>
<article class="section" id="presentations">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-easel"></i> Presentations</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/presentations/">View All <i class="bi bi-arrow-right-circle"></i></a>
</div>
<div class="list-group list-group-flush" id="presentation-list">
{% for presentation in data.presentations[:5] %}
{% include 'partials/presentation.html' %}
{% endfor %}
</div>
</article>
<article id="services" class="rounded text-body-emphasis bg-body-secondary flex-md-row my-4 position-relative p-4 transition-shadow" style="transition: box-shadow 0.2s ease-in-out;" onmouseover="this.classList.add('shadow-sm')" onmouseout="this.classList.remove('shadow-sm')">
<h2 class="mb-3"><i class="bi bi-person-lines-fill"></i> Services</h2>
<div id="service-list">
<ul class="list ps-3">
{% for service in data.services %}
<li>{{ service }}</li>
{% endfor %}
</ul>
</div>
</article>
<article class="section" id="blog">
<div class="d-flex justify-content-between align-items-center mb-1">
<h2 class="section-title d-inline-block mb-0"><i class="bi bi-newspaper"></i> Blog</h2>
<a class="mb-0 link link-secondary link-underline-opacity-0 h5" href="/blog/">View All <i class="bi bi-arrow-right-circle"></i></a>
</div>
<div class="list-group list-group-flush" id="blog-list">
{% for blog in data.blogs[:3] %}
{% include 'partials/blog.html' %}
{% endfor %}
</div>
</article>
{% endblock %}
{% block extra_js %}
{{ super() }}
{% endblock %}

View file

@ -0,0 +1,4 @@
<div class="list-group-item px-0">
<a class="mb-1 paper-title blog-link" href="/blog/html/{{ blog.path }}.html">{{ blog.title }}</a> | <span class="paper-title text-muted">{{ blog.badge }}</span>
<p class="card-text mb-auto tldr">{{ blog.tldr }}</p>
</div>

View file

@ -0,0 +1,16 @@
<nav class="navbar navbar-expand">
<ul class="navbar-nav d-flex justify-content-evenly mx-auto gap-3 gap-md-5">
<li class="nav-item">
<a class="link nav-link px-0" href="/#publications">Publications</a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#projects">Projects</a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#presentations">Presentations</a>
</li>
<li class="nav-item">
<a class="link nav-link px-0" href="/#services">Services</a>
</li>
</ul>
</nav>

View file

@ -0,0 +1,13 @@
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name primary-text">
{{ presentation.tags|join("<span class='text-muted'> | </span>")|safe }}
</p>
<div class="d-flex gap-2">
{% for name, url in presentation.links.items() %}
<a class="link icon-link icon-link-hover paper-link link-secondary" href="{{ url }}" target="_blank" rel="noopener noreferrer">{{ name }}</a>
{% endfor %}
</div>
</div>
<h5 class="mb-1 paper-title">{{ presentation.title|safe }}</h5>
</div>

View file

@ -0,0 +1,14 @@
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name {% if type == 'primary' %}primary-text{% else %}secondary-text{% endif %}">
{{ project.tags|join("<span class='text-muted'> | </span>")|safe }}
</p>
<div class="d-flex gap-2">
{% for name, url in project.links.items() %}
<a class="link icon-link icon-link-hover paper-link link-secondary" href="{{ url }}" target="_blank" rel="noopener noreferrer">{{ name }}</a>
{% endfor %}
</div>
</div>
<h5 class="mb-1 paper-title">{{ project.title|safe }}</h5>
<p class="card-text mb-auto project-desc">{{ project.desc|safe }}</p>
</div>

View file

@ -0,0 +1,14 @@
<div class="list-group-item px-0">
<div class="d-flex justify-content-between align-items-center mb-1">
<p class="d-inline-block mb-0 venue-name {% if type == 'primary' %}primary-text{% else %}secondary-text{% endif %}">
{{ pub.tags|join("<span class='text-muted'> | </span>")|safe }}
</p>
<div class="d-flex gap-2">
{% for name, url in pub.links.items() %}
<a class="link icon-link icon-link-hover paper-link link-secondary" href="{{ url }}" target="_blank" rel="noopener noreferrer">{{ name }}</a>
{% endfor %}
</div>
</div>
<h5 class="mb-1 paper-title">{{ pub.title|safe }}</h5>
<p class="card-text mb-auto author-name">{{ pub.authors|safe }}</p>
</div>

View file

@ -0,0 +1,18 @@
{% extends 'base.html' %}
{% block title %}Yan Lin's Presentations{% endblock %}
{% block header_title %}Yan Lin's Presentations{% endblock %}
{% block navigation %}
{% endblock %}
{% block content %}
<article class="section mt-4">
<div class="list-group list-group-flush">
{% for presentation in data.presentations %}
{% include 'partials/presentation.html' %}
{% endfor %}
</div>
</article>
{% endblock %}

31
templates/projects.html Normal file
View file

@ -0,0 +1,31 @@
{% extends 'base.html' %}
{% block title %}Yan Lin's Projects{% endblock %}
{% block header_title %}Yan Lin's Projects{% endblock %}
{% block navigation %}
<!-- {% include 'partials/navigation.html' %} -->
{% endblock %}
{% block content %}
<article class="section mt-4">
<h2 class="section-title mb-3"><i class="bi bi-star-fill"></i> Primary Projects</h2>
<div class="list-group list-group-flush mb-4">
{% for project in data.primaryProjects %}
{% with type='primary' %}
{% include 'partials/project.html' %}
{% endwith %}
{% endfor %}
</div>
<h2 class="section-title mb-3 mt-4"><i class="bi bi-star"></i> Secondary Projects</h2>
<div class="list-group list-group-flush">
{% for project in data.secondaryProjects %}
{% with type='secondary' %}
{% include 'partials/project.html' %}
{% endwith %}
{% endfor %}
</div>
</article>
{% endblock %}

View file

@ -0,0 +1,35 @@
{% extends 'base.html' %}
{% block title %}Yan Lin's Publications{% endblock %}
{% block header_title %}Yan Lin's Publications{% endblock %}
{% block navigation %}
<!-- {% include 'partials/navigation.html' %} -->
{% endblock %}
{% block content %}
<article class="section mt-4">
<h2 class="section-title mb-3"><i class="bi bi-star-fill"></i> Primary Publications</h2>
<div class="list-group list-group-flush mb-4">
{% for pub in data.primaryPublications %}
{% with type='primary' %}
{% include 'partials/publication.html' %}
{% endwith %}
{% endfor %}
</div>
<h2 class="section-title mb-3 mt-4"><i class="bi bi-star"></i> Secondary Publications</h2>
<div class="list-group list-group-flush">
{% for pub in data.secondaryPublications %}
{% with type='secondary' %}
{% include 'partials/publication.html' %}
{% endwith %}
{% endfor %}
</div>
<div class="text-start mt-3">
<small class="text-muted" style="font-size: 0.8rem;">* Equal Contribution</small>
</div>
</article>
{% endblock %}