diff --git a/data.yaml b/data.yaml index ee6c0fe..2fc1b52 100644 --- a/data.yaml +++ b/data.yaml @@ -5,6 +5,7 @@ primaryPublications: - "IEEE TKDE" - "2025" links: + Paper: "https://ieeexplore.ieee.org/document/11004614" Preprint: "https://arxiv.org/abs/2402.07232" Code: "https://github.com/Logan-Lin/UVTM" @@ -115,7 +116,7 @@ secondaryPublications: authors: "Letian Gong, Shengnan Guo, Yan Lin, Yichen Liu, Erwen Zheng, Yiwei Shuang, Youfang Lin, Jilin Hu, Huaiyu Wan" tags: - "IEEE TKDE" - - "2024" + - "2024" links: Paper: "https://ieeexplore.ieee.org/document/10836764" @@ -181,13 +182,13 @@ secondaryPublications: Code: "https://github.com/Water2sea/WITRAN" primaryProjects: - - title: 'Research on Prediction of User Travel Destination and Travel Time Based on Trajectory Representation Learning' + - title: "Research on Prediction of User Travel Destination and Travel Time Based on Trajectory Representation Learning" tags: - "Fundamental Research Funds for the Central Universities of China" desc: "Applies representation learning to trajectory data to transform original features into high-level information, improving the performance of downstream tasks such as travel time and destination prediction." links: {} - - title: 'Development of OverleafCopilot - Empowering Academic Writing in Overleaf with Large Language Models' + - title: "Development of OverleafCopilot - Empowering Academic Writing in Overleaf with Large Language Models" tags: - "Personal Interest Project" desc: "This project aims to develop a Browser extension to seamlessly integrate Large Language Models (such as ChatGPT) into the popular online academic writing platform, Overleaf." @@ -195,7 +196,7 @@ primaryProjects: Home: "https://www.overleafcopilot.com/" Install: "https://chromewebstore.google.com/detail/overleaf-copilot/eoadabdpninlhkkbhngoddfjianhlghb" - - title: 'Development of PromptGenius - All-purpose prompts for LLMs' + - title: "Development of PromptGenius - All-purpose prompts for LLMs" tags: - "Personal Interest Project" desc: "This project focuses on developing a website that offers a wide range of prompt categories, enhancing the versatility of LLMs for various tasks and improving their output quality." @@ -204,33 +205,33 @@ primaryProjects: Code: "https://github.com/wenhaomin/ChatGPT-PromptGenius" secondaryProjects: - - title: 'Research on Inverse Design of Materials Using Diffusion Probabilistic Models' + - title: "Research on Inverse Design of Materials Using Diffusion Probabilistic Models" tags: - "Villum Foundation" desc: "This project focuses on developing diffusion probabilistic models to first understand the relationship between chemistry/structure and material properties, then enable the inverse design of new materials with specific properties. This project currently supports my postdoctoral research position." links: {} - - title: 'Research on Pre-training Representation Learning Methods of Spatial-temporal Trajectory Data for Traffic Prediction' + - title: "Research on Pre-training Representation Learning Methods of Spatial-temporal Trajectory Data for Traffic Prediction" tags: - "National Natural Science Foundation of China" desc: "This project aims to propose pre-training representation learning methods for spatial-temporal trajectory data, modeling multiple features to improve traffic prediction tasks. It demonstrates how trajectory representation learning can enhance traffic data mining." 
links: {} - - title: 'Research on Spatial-temporal Trajectory Generation and Representation Learning Methods for Sparsity Problems' + - title: "Research on Spatial-temporal Trajectory Generation and Representation Learning Methods for Sparsity Problems" tags: - "National Natural Science Foundation of China" desc: "This project explores how to generate high-quality spatial-temporal trajectory data and corresponding representations to address sparsity-related issues, thereby supporting a variety of downstream tasks." links: {} presentations: - - title: 'Self-supervised Learning of Trajectory Data' + - title: "Self-supervised Learning of Trajectory Data" tags: - "Guest lecture" - "Aalborg University" links: Slides: "/assets/Self-supervised Learning of Trajectory Data.pdf" - - title: 'PLM4Traj: Leveraging Pre-trained Language Models for Cognizing Movement Patterns and Travel Purposes from Trajectories' + - title: "PLM4Traj: Leveraging Pre-trained Language Models for Cognizing Movement Patterns and Travel Purposes from Trajectories" tags: - "Workshop presentation" - "KDD 2024" @@ -238,21 +239,21 @@ presentations: Slides: "/assets/KDD_2024_Workshop_PLM4Traj.pdf" Paper: "https://arxiv.org/abs/2405.12459" - - title: 'Origin-Destination Travel Time Oracle for Map-based Services' + - title: "Origin-Destination Travel Time Oracle for Map-based Services" tags: - "Paper Oral" - "SIGMOD 2024" links: Slides: "/assets/SIGMOD-Oral-PPT.pdf" - - title: 'Self-supervised Learning of Spatial-temporal Trajectories' + - title: "Self-supervised Learning of Spatial-temporal Trajectories" tags: - "Tutorial" - "SpatialDI 2024" links: Slides: "/assets/Talk on SpatialDI 2024.pdf" - - title: 'Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction' + - title: "Pre-training Context and Time Aware Location Embeddings from Spatial-Temporal Trajectories for User Next Location Prediction" tags: - "Paper Oral" - "AAAI 2021" @@ -264,14 +265,3 @@ services: - "Secretary of IEEE (Denmark Section) Computer Society" - "Reviewer for journals including TIST, TII, and TVT" - "Member of program committees of KDD, ICLR, NeurIPS, AAAI, CVPR, ICCV, IJCAI, and WWW" - -blogs: - - title: "One Step Diffusion Models" - badge: "May 2025" - path: "one-step-diffusion-models" - tldr: "Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps." - - - title: "Multi-modal and Multi-function Transformers" - badge: "April 2025" - path: "multi-modal-transformer" - tldr: "Multi-modal and multi-function Transformers enables a single architecture to process diverse data types such as language, images, and videos simultaneously. These models employ techniques like vector quantization and lookup-free quantization to map different modalities into a unified embedding space, allowing the Transformer to handle them within the same sequence. Beyond processing multiple data types, these architectures can also combine different functionalities-such as auto-regressive language generation and diffusion-based image creation-within a single model." 
\ No newline at end of file diff --git a/dist/blog/html/multi-modal-transformer.html b/dist/blog/html/multi-modal-transformer.html deleted file mode 100644 index 32cc058..0000000 --- a/dist/blog/html/multi-modal-transformer.html +++ /dev/null @@ -1,245 +0,0 @@ - - - - - - - Yan Lin's Blog - Multi-modal and Multi-function Transformers - - - - - - - - - - -
Multi-modal and Multi-function Transformers

-

Transformers have gained immense popularity within deep learning and AI communities in recent years. Since their introduction in Vaswani et al., "Attention Is All You Need", they have proven to be powerful sequential models across diverse domains, with thousands of variations and "improved versions." The rise of Large Language Models (LLMs), which largely use Transformers as their foundation, has led to another surge in research around this architecture. This trend has even led graph learning and Computer Vision (CV) communities to move beyond their established foundation models (i.e., GNNs and CNNs) and embrace Transformers. This explains the increasing prevalence of graph Transformers and image Transformers today.

-
-

Han et al., “A Survey on Vision Transformer”; Khan et al., “Transformers in Vision”; Yun et al., “Graph Transformer Networks.”

-
-

Beyond "chasing the trend," using Transformer as a unified foundation model offers several advantages:

- Transformers excel at capturing long-term dependencies. Unlike GNNs and CNNs which require deeper network structures for longer context, Transformers natively support global dependency modeling through their self-attention mechanism. They also avoid global smoothing and vanishing gradient problems that hinder context length scaling in other network architectures.
- Transformers process sequences in parallel rather than sequentially, enabling full utilization of GPU acceleration. This advantage can be further enhanced with techniques like those described in Dao et al., "FlashAttention."
- Transformers are flexible network structures. They don't inherently enforce sequentiality—without positional encoding, the ordering of input steps to Transformers is equivalent. Through strategic permutation and positional encoding, Transformers can adapt to a wide range of structured and unstructured data.
- The development of LLMs has made many open-weight Transformer models available with strong natural language understanding capabilities. These Transformers can be prompted and fine-tuned to model other modalities such as spatiotemporal data and images while retaining their language modeling abilities, creating opportunities for developing multi-modal foundation models.
- From a practical perspective, using Transformer as a foundation allows reuse of technical infrastructure and optimizations developed over years, including efficient architecture designs, training pipelines, and specialized hardware.

In this article, we will briefly explore techniques for unifying multiple modalities (e.g., natural language and images) and multiple functionalities (e.g., language models and diffusion denoisers) within a single Transformer. These techniques are largely sourced from recent oral papers presented at ICML, ICLR, and CVPR conferences. I assume readers have general knowledge of basic concepts in ML and neural networks, Transformers, LLMs, and diffusion models.

-

Since images and language modalities represent continuous and discrete data respectively, we will use them as examples throughout this article. Keep in mind that the techniques introduced can be readily extended to other modalities, including spatiotemporal data.

-

General Goal

-

The goal of a multi-modal Transformer is to create a model that can accept multi-modal inputs and produce multi-modal outputs. For example, instead of using a CNN-based image encoder and a Transformer-based language encoder to map image and language modalities to the latent space separately, a multi-modal Transformer would be able to process the combination of image and language (sentence) as a single sequence.

-
- image -
An example of “conventional” multi-modal fusion. Each modality is processed by a separate model, and the two are fused at some point. Source: Xiang, Hao, Runsheng Xu, and Jiaqi Ma. "HM-ViT: Hetero-modal vehicle-to-vehicle cooperative perception with vision transformer." CVPR, 2023.
-
-
- image (1) -
An example of a Transformer that can handle multi-modal inputs and outputs. Different modalities are all projected into tokens and subsequently processed by a unified Transformer encoder. Source: Kondratyuk, Dan, Lijun Yu, et al. “VideoPoet: A Large Language Model for Zero-Shot Video Generation,” ICML, 2024.
-
-

Beyond multi-modal processing, a multi-function Transformer can, for example, function as both a language model (auto-regressive generation) and diffusion denoiser (score-matching generation) simultaneously, supporting two of the most common generation schemes used today.

-

Modality Embedding

-

A fundamental challenge in unifying multiple modalities within a single Transformer is how to represent different modalities in the same embedding space. For the "QKV" self-attention mechanism to work properly, each item in the input sequence must be represented by an embedding vector of the same dimension, matching the "model dimension" of the Transformer.

-
- image (2) -
Illustration of the QKV self-attention mechanism in Transformer. Source
-
-

The most common method for mapping language into the embedding space is through tokenization and token embedding. A tokenizer maps a word or word fragment into a discrete token index, and an index-fetching embedding layer (implemented in frameworks like PyTorch with nn.Embedding) maps this index into a fixed-dimension embedding vector. In principle, all discrete features can be mapped into the embedding space using this approach.

-
- 1_Dk1X5rmLomXqqTPeuHgBpw -
Visualization of tokenizer and index-fetching embedding layer. Source
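To make the tokenize-then-embed step concrete, here is a minimal PyTorch sketch; the whitespace tokenizer and four-word vocabulary are toy assumptions, not a real tokenizer.

```python
import torch
import torch.nn as nn

# Toy whitespace "tokenizer": maps each word to a discrete token index (illustrative only).
vocab = {"<unk>": 0, "the": 1, "cat": 2, "sat": 3}

def tokenize(sentence: str) -> torch.Tensor:
    return torch.tensor([vocab.get(w, 0) for w in sentence.lower().split()])

# Index-fetching embedding layer: one learnable row per token index.
embed = nn.Embedding(num_embeddings=len(vocab), embedding_dim=8)

token_ids = tokenize("The cat sat")   # tensor([1, 2, 3])
token_embeddings = embed(token_ids)   # shape (3, 8), ready to enter a Transformer
print(token_ids, token_embeddings.shape)
```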
-
-

Vector Quantization

-

For continuous features, one intuitive approach is to first tokenize them into discrete tokens, thereby unifying the embedding process across both discrete and continuous features. Vector quantization, introduced in VQ-VAE, is one of the most common methods for this purpose.

-
-

Van Den Oord, Aaron, and Oriol Vinyals. "Neural discrete representation learning." NeurIPS, 2017.

-
-

Vector quantization maintains a "codebook" $\boldsymbol C \in \mathbb R^{n\times d}$, which functions similarly to the index-fetching embedding layer, where $n$ is the total number of unique tokens, and $d$ is the embedding size. A given continuous vector $\boldsymbol{z}\in\mathbb R^{d}$ is quantized into a discrete value $i\in[0,n-1]$ by finding the closest row vector in $\boldsymbol C$ to $\boldsymbol{z}$, and that row vector $\boldsymbol C_i$ is fetched as the embedding for $\boldsymbol{z}$. Formally:
$$
i = \arg\min_j \|\boldsymbol z - \boldsymbol C_j\|_2
$$
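As a concrete illustration of this nearest-neighbor lookup, here is a minimal PyTorch sketch; the codebook is random here, and the straight-through estimator and codebook losses needed to actually train a VQ-VAE are omitted.

```python
import torch

n, d = 512, 64                    # codebook size and embedding dimension (arbitrary choices)
codebook = torch.randn(n, d)      # C in R^{n x d}; learned jointly with the model in a real VQ-VAE

def vector_quantize(z: torch.Tensor):
    """Quantize continuous vectors z of shape (B, d) into token indices and embeddings."""
    dists = torch.cdist(z, codebook)     # (B, n) L2 distances to every row of C
    indices = dists.argmin(dim=1)        # i = argmin_j ||z - C_j||_2
    return indices, codebook[indices]    # discrete tokens and their fetched embeddings

z = torch.randn(4, d)
ids, z_q = vector_quantize(z)
print(ids.shape, z_q.shape)              # torch.Size([4]) torch.Size([4, 64])
```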

-

Lookup-Free Quantization

-

A significant limitation of vector quantization is that it requires calculating distances between the given continuous vectors and the entire codebook, which becomes computationally expensive for large-scale codebooks. This creates tension with the need for expanded codebooks to represent complex modalities such as images and videos. Research has shown that simply increasing the number of unique tokens doesn't always improve codebook performance.

-
-

“A simple trick for training a larger codebook involves decreasing the code embedding dimension when increasing the vocabulary size.” Source: Yu, Lijun, Jose Lezama, et al. “Language Model Beats Diffusion - Tokenizer Is Key to Visual Generation,” ICLR, 2024.

-
-

Building on this insight, Lookup-Free Quantization (LFQ) eliminates the embedding dimension of codebooks (essentially reducing the embedding dimension to 0) and directly calculates the discrete index $i$ by individually quantizing each dimension of $\boldsymbol z$ into a binary digit. The index $i$ can then be computed by converting the binary representation to decimal. Formally:
$$
i=\sum_{j=1}^{d} 2^{(j-1)}\cdot \mathbb{1}(z_j > 0)
$$

-
-

For example, given a continuous vector $\boldsymbol z=\langle -0.52, 1.50, 0.53, -1.32\rangle$, we first quantize each dimension into $\langle 0, 1, 1, 0\rangle$, based on the sign of each dimension. The token index of $\boldsymbol z$ is simply the decimal equivalent of the binary 0110, which is 6.

-
-

However, this approach introduces another challenge: we still need an index-fetching embedding layer to map these token indices into embedding vectors for the Transformer. This, combined with the typically large number of unique tokens when using LFQ—a 32-dimensional $\boldsymbol z$ will result in $2^{32}=4,294,967,296$ unique tokens—creates significant efficiency problems. One solution is to factorize the token space. Effectively, this means splitting the binary digits into multiple parts, embedding each part separately, and concatenating the resulting embedding vectors. For example, with a 32-dimensional $\boldsymbol z$, if we quantize and embed its first and last 16 dimensions separately, we "only" need to handle $2^{16}\times 2 = 131,072$ unique tokens.
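The sketch below illustrates the sign-based index computation and the factorization trick under assumed dimensions (a 4-dimensional toy vector and a 32-dimensional $\boldsymbol z$ split into two 16-bit halves); it is a simplified illustration, not the implementation from the cited papers.

```python
import torch

def lfq_indices(z: torch.Tensor) -> torch.Tensor:
    """Lookup-free quantization: i = sum_j 2^(j-1) * 1(z_j > 0)."""
    bits = (z > 0).long()                        # binary digit per dimension
    powers = 2 ** torch.arange(z.shape[-1])      # 1, 2, 4, ...
    return (bits * powers).sum(dim=-1)           # decimal token index per vector

z = torch.tensor([[-0.52, 1.50, 0.53, -1.32]])
print(lfq_indices(z))                            # tensor([6]), matching the example above

# Factorized token space: two 16-bit sub-tokens instead of one 2^32-way token.
z32 = torch.randn(4, 32)
sub_tokens = [lfq_indices(part) for part in z32.split(16, dim=-1)]   # each index in [0, 2^16)
# Each half would get its own nn.Embedding(2**16, d_model // 2); the two embeddings are concatenated.
```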

-

Note that this section doesn't extensively explain how to map raw continuous features into the vector $\boldsymbol z$, as these techniques are relatively straightforward and depend on the specific feature type—for example, fully-connected layers for numerical features, or CNN/GNN with feature flattening for structured data.

-

Quantization over Linear Projection

-

You might be asking—why can't we simply use linear projections to map the raw continuous features into the embedding space? What are the benefits of quantizing continuous features into discrete tokens?

-

Although Transformers are regarded as universal sequential models, they were originally designed for discrete tokens when first introduced in Vaswani et al., "Attention Is All You Need". Empirically, they tend to perform best on discrete tokens rather than raw continuous features. This is supported by many research papers showing that quantizing continuous features improves the performance of Transformers, as well as by works demonstrating Transformers' subpar performance when applied directly to continuous features.

-
-

Mao, Chengzhi, Lu Jiang, Mostafa Dehghani, Carl Vondrick, Rahul Sukthankar, and Irfan Essa. “Discrete Representations Strengthen Vision Transformer Robustness,” ICLR, 2022.

-

Ilbert, Romain, Ambroise Odonnat, et al. “SAMformer: Unlocking the Potential of Transformers in Time Series Forecasting with Sharpness-Aware Minimization and Channel-Wise Attention,” ICML, 2024.

-
-

On the other hand, unifying different modalities into tokens is especially beneficial in the context of Transformer-based "foundation models," since it preserves the auto-regressive next-token prediction architecture of LLMs. Combined with special tokens such as "start of sentence" and "end of sentence," the Transformer model can flexibly generate content of mixed modalities with varying length.

-
-

For example, by quantizing videos into discrete tokens and combining the token space of videos and language, one can create a unified Transformer model that generates both videos and language in one sequence. The start and end points of video and language sub-sequences are fully determined by the model, based on the specific input prompt. This structure would be difficult to replicate if we used tokenization for language but linear projection for videos.

-
-

Transformer Backbone

-

After different modalities are mapped into the same embedding space, they can be arranged into a sequence of embedding vectors and input into a Transformer backbone. We don't discuss the variations of Transformer structure and improvement techniques here, as they are numerous, and ultimately function similarly as sequential models.

-
-

Lan et al., “ALBERT”; Ye et al., “Differential Transformer”; Kitaev, Kaiser, and Levskaya, “Reformer”; Su et al., “RoFormer”; Dai et al., “Transformer-XL.”

-
-

As we know, the "full" Transformer structure proposed in Vaswani et al., "Attention Is All You Need" includes an encoder and a decoder. They perform self-attention within their respective input sequences, and the decoder additionally performs cross-attention between its input sequence and the memory sequence derived from the encoder's output. Some early language models use encoder-only structure (like Devlin et al., "BERT") focused on outputting embedding vectors or encoder-decoder structure (like Chung et al., "Scaling Instruction-Finetuned Language Models") for generating natural language output. Most modern large language models and foundation models use decoder-only structure (like Brown et al., "Language Models Are Few-Shot Learners"), focusing on auto-regressive generation of language output.

-

The encoder-only structure theoretically excels at representation learning, and its produced embedding vectors can be applied to various downstream tasks. Recent developments have gradually moved towards decoder-only structure, centered around the idea of building models that are capable of directly generating the required final output of every downstream task.

-
-

For example, to perform sentiment analysis, BERT will compute an embedding vector for the query sentence, and the embedding vector can be used in a dedicated classifier to predict the sentiment label. GPT, on the other hand, can directly answer the question "what is the sentiment associated with the query sentence?" Comparatively, GPT is more versatile in most cases and can easily perform zero-shot prediction.

-
-

Nevertheless, representation learning is still a relevant topic. The general understanding is that decoder-only structure cannot perform conventional representation learning, for example mapping a sentence into a fixed-dimension embedding vector. Yet, there are a few works in the latest ICLR that shed light on the utilization of LLMs as representation learning or embedding models:

-
-

Gao, Leo, Tom Dupre la Tour, Henk Tillman, Gabriel Goh, Rajan Troll, Alec Radford, Ilya Sutskever, Jan Leike, and Jeffrey Wu. “Scaling and Evaluating Sparse Autoencoders,” 2024. Link

-

Li, Ziyue, and Tianyi Zhou. “Your Mixture-of-Experts LLM Is Secretly an Embedding Model for Free,” 2024. Link

-

Zhang, Jie, Dongrui Liu, Chen Qian, Linfeng Zhang, Yong Liu, Yu Qiao, and Jing Shao. “REEF: Representation Encoding Fingerprints for Large Language Models,” 2024. Link

-
-

Output Layer

-

For language generation, Transformers typically use classifier output layers, mapping the latent vector of each item in the output sequence back to tokens. As we've established in the "modality embedding" section, the optimal method to embed continuous features is to quantize them into discrete tokens. Correspondingly, an intuitive method to output continuous features is to map these discrete tokens back to the continuous feature space, essentially reversing the vector quantization process.

-

Reverse Vector Quantization

-

One approach to reverse vector quantization is readily available in VQ-VAE, since it is an auto-encoder. Given a token $i$, we can look up its embedding in the codebook as $\boldsymbol C_i$, then apply a decoder network to map $\boldsymbol C_i$ back to the continuous feature vector $\boldsymbol z$. The decoder network can either be pre-trained in the VQ-VAE framework (pre-training the VQ-VAE tokenizer, encoder, and decoder with auto-encoding loss functions) or trained end-to-end along with the whole Transformer. In the NLP and CV communities, the pre-training approach is more popular, since there are many large-scale pre-trained auto-encoders available.

-
- image (4) -
The encoder-decoder structure of MAGVIT (Yu et al., “MAGVIT”), a visual VQ-VAE model. A 3D-VQ encoder quantizes a video into discrete tokens, and a 3D-VQ decoder maps them back to the pixel space.
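A minimal sketch of this decode path, with a random codebook and a tiny fully-connected decoder standing in for a pre-trained VQ-VAE decoder (real models such as MAGVIT use much larger convolutional or 3D architectures):

```python
import torch
import torch.nn as nn

n, d = 512, 64
codebook = torch.randn(n, d)          # shared with the tokenizer side in a real VQ-VAE

decoder = nn.Sequential(              # toy decoder: codebook embedding -> 3x4x4 "patch"
    nn.Linear(d, 128), nn.ReLU(),
    nn.Linear(128, 3 * 4 * 4),
)

def detokenize(token_ids: torch.Tensor) -> torch.Tensor:
    z_q = codebook[token_ids]                 # look up C_i for each predicted token
    return decoder(z_q).view(-1, 3, 4, 4)     # map back to the continuous feature space

print(detokenize(torch.tensor([7, 42])).shape)   # torch.Size([2, 3, 4, 4])
```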
-
-

Efficiency Enhancement

-

For continuous feature generation, unlike language generation where the output tokens are the final output, we are essentially representing the final output with a token space of limited size. Thus, for complicated continuous features like images and videos, we have to expand the token space, or use more tokens to represent one image or one video frame, to improve generation quality, which can result in efficiency challenges.

-

There are several workarounds to improve the efficiency of multi-modal outputs. One approach is to generate low-resolution outputs first, then use a separate super-resolution module to improve the quality of the output. This approach is explored in Kondratyuk et al., "VideoPoet" and Tian et al., "Visual Autoregressive Modeling". Interestingly, the overall idea is very similar to NVIDIA's DLSS, where the graphics card renders a low-resolution frame (e.g., 1080p) using the conventional rasterization pipeline, and a super-resolution model then increases the frame's resolution (e.g., to 4K) using the graphics card's tensor hardware, improving games' overall frame rate.

-

Another workaround follows the idea of compression. Take video generation as an example: the model generates full features for key frames, and light-weight features for motion vectors that describe subtle differences from those key frames. This is essentially how inter-frame compressed video codecs work, taking advantage of the temporal redundancy between neighboring frames.

-
- image (5) -
Key frames and motion vectors used in Jin et al., “Video-LaVIT.”
-
-

Fuse with Diffusion Models

-

Despite continuous efforts to enable representation and generation of images and videos with a language model structure (auto-regressive), current research indicates that diffusion models (more broadly speaking, score-matching generative models) outperform language models on continuous feature generation. Score-matching generative models have their own separate and substantial community, with strong theoretical foundations and numerous variations emerging each year, such as stochastic differential equations, Bayesian flow, and rectified flow. In short, score-matching generative models are clearly here to stay alongside language models.

-

An intriguing question arises: why not integrate the structures of language models and diffusion models into one Transformer to reach the best of both worlds? Zhou et al. in "Transfusion" explored this idea. The approach is straightforward: build a Transformer that can handle both language and image inputs and outputs. The language component functions as a language model, while the image component serves as a denoiser network for diffusion models. The model is trained by combining the language modeling loss and DDPM loss, enabling it to function either as a language model or a text-to-image denoiser.

-
- image (6) -
A Transformer capable of functioning as a language model and a diffusion denoiser at the same time. Source: Zhou, Chunting, Lili Yu, et al. “Transfusion: Predict the Next Token and Diffuse Images with One Multi-Modal Model,” ICLR, 2025.
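A highly simplified sketch of the combined objective (stand-in tensors for the backbone's outputs; the real Transfusion model's patchification, attention masking, and noise scheduling are not shown):

```python
import torch
import torch.nn.functional as F

def combined_loss(token_logits, token_targets, eps_pred, eps_true, lam=1.0):
    """Language-modeling cross-entropy on text positions plus DDPM MSE on image positions."""
    lm_loss = F.cross_entropy(token_logits.flatten(0, 1), token_targets.flatten())
    ddpm_loss = F.mse_loss(eps_pred, eps_true)
    return lm_loss + lam * ddpm_loss

logits = torch.randn(2, 10, 1000)                  # (batch, text positions, vocab size)
targets = torch.randint(0, 1000, (2, 10))
eps_pred, eps_true = torch.randn(2, 16, 64), torch.randn(2, 16, 64)  # (batch, image patches, dim)
print(combined_loss(logits, targets, eps_pred, eps_true).item())
```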
-
-

Conclusion

-

In conclusion, the evolution of Transformers into versatile foundation models capable of handling multiple modalities and functionalities represents a significant advancement in AI research. By enabling a single architecture to process diverse data types through techniques like vector quantization and lookup-free quantization, researchers have created models that can seamlessly integrate language, images, and other modalities within the same embedding space.

-

In our research domain, we encounter even more diverse and domain-specific multi-modal data, such as traffic flows, trajectories, and real-world agent interactions. A unified Transformer for such data presents a promising solution for creating "foundation models" that generalize across diverse tasks and scenarios. However, domain-specific challenges, including data encoding and decoding, computational efficiency, and scalability, must be addressed to realize this potential.

-
-

Copyright © 2025. Designed and implemented by Yan Lin.

-
- - - - - \ No newline at end of file diff --git a/dist/blog/html/one-step-diffusion-models.html b/dist/blog/html/one-step-diffusion-models.html deleted file mode 100644 index ffa8cfb..0000000 --- a/dist/blog/html/one-step-diffusion-models.html +++ /dev/null @@ -1,241 +0,0 @@ - - - - - - - Yan Lin's Blog - One Step Diffusion Models - - - - - - - - - - -

One Step Diffusion Models

-

Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.

-
-

Background

-

Diffusion models (DMs), or more broadly speaking, score-matching generative models, have become the de facto framework for building deep generative models. They demonstrate exceptional generation performance, especially on continuous modalities including images, videos, audio, and spatiotemporal data.

-

Most diffusion models work by coupling a forward diffusion process and a reverse denoising diffusion process. The forward diffusion process gradually adds noise to the ground truth clean data $X_0$, until noisy data $X_T$ that follows a relatively simple distribution is reached. The reverse denoising diffusion process starts from the noisy data $X_T$, and removes the noise component step-by-step until clean generated data $X_0$ is reached. The reverse process is essentially a sequential Monte-Carlo-style sampling process, meaning its steps cannot be parallelized within a single generation, which can be inefficient when the number of steps is large.

-
- image-20250503125941212 -
The two processes in a typical diffusion model. Source: Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”
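For reference, here is a heavily simplified DDPM-style sketch of the two processes (toy 2-dimensional data, an untrained stand-in denoiser, and a plain ancestral sampling loop following the standard DDPM formulation):

```python
import torch
import torch.nn as nn

T = 1000
betas = torch.linspace(1e-4, 0.02, T)
alphas = 1.0 - betas
alpha_bar = torch.cumprod(alphas, dim=0)

eps_model = nn.Sequential(nn.Linear(2 + 1, 64), nn.ReLU(), nn.Linear(64, 2))  # stand-in denoiser

def forward_diffuse(x0, t):
    """Forward process q(x_t | x_0): add noise to clean data in closed form."""
    eps = torch.randn_like(x0)
    xt = alpha_bar[t].sqrt()[:, None] * x0 + (1 - alpha_bar[t]).sqrt()[:, None] * eps
    return xt, eps

@torch.no_grad()
def sample(n=8):
    """Reverse process: sequential, step-by-step denoising from x_T back to x_0."""
    x = torch.randn(n, 2)
    for t in reversed(range(T)):
        eps_hat = eps_model(torch.cat([x, torch.full((n, 1), t / T)], dim=1))
        coef = (1 - alphas[t]) / (1 - alpha_bar[t]).sqrt()
        x = (x - coef * eps_hat) / alphas[t].sqrt()
        if t > 0:
            x = x + betas[t].sqrt() * torch.randn_like(x)
    return x

xt, eps = forward_diffuse(torch.randn(4, 2), torch.tensor([10, 500, 900, 999]))
print(xt.shape, sample().shape)
```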
-
-

Understanding DMs

-

There are many ways to understand how Diffusion Models (DMs) work. One of the most common and intuitive approaches is that a DM learns an ordinary differential equation (ODE) that transforms noise into data. Imagine an ODE vector field between the noise $X_T$ and the clean data $X_0$. By training on sufficiently large numbers of timesteps $t$, a DM is able to learn the vector (tangent) towards the cleaner data $X_{t-1}$, given any specific timestep $t$ and the corresponding noisy data $X_t$. This idea is easy to illustrate in a simplified 1-dimensional data scenario.

-
- image-20250503132738122 -
Illustrated ODE flow of a diffusion model on 1-dimensional data. Source: Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.” It should be noted that as the figure suggests, there are differences between ODEs and DMs in a narrow sense. Flow matching models, a variant of DMs, more closely resemble ODEs.
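Under this ODE view, sampling amounts to numerically integrating the learned vector field. The sketch below uses a plain Euler integrator over an assumed velocity network, placing noise at $t=0$ and data at $t=1$ for simplicity; with fewer steps, the integration becomes a coarser polyline approximation of the flow.

```python
import torch
import torch.nn as nn

v_theta = nn.Sequential(nn.Linear(1 + 1, 32), nn.ReLU(), nn.Linear(32, 1))  # assumed velocity net

@torch.no_grad()
def ode_sample(n_samples=16, n_steps=50):
    """Euler integration of dx/dt = v_theta(x, t) from noise (t=0) towards data (t=1)."""
    x = torch.randn(n_samples, 1)                        # 1-dimensional data, as in the figure
    dt = 1.0 / n_steps
    for i in range(n_steps):
        t = torch.full((n_samples, 1), i * dt)
        x = x + v_theta(torch.cat([x, t], dim=1)) * dt   # one step along the learned tangent
    return x

print(ode_sample().shape)    # torch.Size([16, 1])
```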
-
-

DMs Scale Poorly with Few Steps

-

Vanilla DDPM, which is essentially a discrete-timestep DM, can only perform the reverse process using the same number of steps it is trained on, typically thousands. DDIM introduces a reparameterization scheme that enables skipping steps during the reverse process of DDPM. Continuous-timestep DMs, such as those formulated as Stochastic Differential Equations (SDEs), naturally possess the capability of using fewer steps in the reverse process than in the forward process/training.

-
-

Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models”; Song, Meng, and Ermon, “Denoising Diffusion Implicit Models”; Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”

-
-

Nevertheless, it is observed that their performance typically suffers catastrophic degradation when reducing the number of reverse process steps to single digits.

-
- image-20250503135351246 -
Images generated by conventional DMs with only a few steps of reverse process. Source: Frans et al., “One Step Diffusion via Shortcut Models.”
-
-

To understand why DMs scale poorly with few reverse process steps, we can return to the ODE vector field perspective of DMs. When the target data distribution is complex, the vector field typically contains numerous intersections. When a given $t$ and $X_t$ fall at one of these intersections, the learned vector points to the averaged direction of all candidate targets. This causes the generated data to approach the mean of the training data when only a few reverse process steps are used. Another explanation is that the learned vector field is highly curved. Using only a few reverse process steps means attempting to approximate these curves with polylines, which is inherently difficult.

-
- image-20250503141422791 -
Illustration of why DMs scale poorly with few reverse process steps. Source: Frans et al., “One Step Diffusion via Shortcut Models.”
-
-

We will introduce two branches of methods that aim to scale DMs down to a few or even a single reverse process step: distillation-based methods, which distill a pre-trained DM into a one-step model; and end-to-end methods, which train a one-step DM from scratch.

-

Distillation

-

Distillation-based methods are also called rectified flow methods. Their idea follows the above insight of "curved ODE vector field": if the curved vectors (flows) are hindering the scaling of reverse process steps, can we try to straighten these vectors so that they are easy to approximate with polylines or even straight lines?

-

Liu, Gong, and Liu, "Flow Straight and Fast" implements this idea, focusing on learning an ODE that follows straight vectors as much as possible. In the context of continuous-time DMs with $t\in[0,1]$, where $X_0$ denotes the noise and $X_1$ denotes the clean data, suppose the clean data and noise each follows a distribution, $X_1 \sim \pi_1$ and $X_0 \sim \pi_0$. The "straight vectors" can be achieved by solving a nonlinear least squares optimization problem:
$$
\min_{v} \int_0^1 \mathbb{E}\left[ \big\| (X_1 - X_0) - v(X_t, t) \big\|^2 \right] \mathrm{d}t, \qquad X_t = t X_1 + (1-t) X_0,
$$
where $v(X_t, t)$ is the vector field of the ODE $\mathrm{d}X_t = v(X_t, t)\,\mathrm{d}t$.
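A minimal training-step sketch of this objective (the small velocity network, toy 2-dimensional data, and independent noise-data pairing are illustrative assumptions):

```python
import torch
import torch.nn as nn

v_theta = nn.Sequential(nn.Linear(2 + 1, 64), nn.ReLU(), nn.Linear(64, 2))  # assumed velocity net
opt = torch.optim.Adam(v_theta.parameters(), lr=1e-3)

def rectified_flow_step(x1):
    """One step of min_v E||(X1 - X0) - v(X_t, t)||^2 with X_t = t*X1 + (1-t)*X0."""
    x0 = torch.randn_like(x1)                   # noise endpoint X0 ~ pi_0
    t = torch.rand(x1.shape[0], 1)              # t ~ U[0, 1]
    xt = t * x1 + (1 - t) * x0                  # point on the straight line between X0 and X1
    v = v_theta(torch.cat([xt, t], dim=1))
    loss = ((x1 - x0 - v) ** 2).mean()          # regress onto the straight direction X1 - X0
    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss.item()

print(rectified_flow_step(torch.randn(32, 2)))  # toy batch of 2-D "clean data"
```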

-

Though straightforward, when the clean data distribution is very complicated, the ideal result of completely straight vectors can be hard to achieve. To address this, a "reflow" procedure is introduced. This procedure iteratively trains new rectified flows using data generated by previously obtained flows:
$$
v^{k+1} = \mathtt{RectFlow}\big( (X_0^{k}, X_1^{k}) \big),
$$
where $(X_0^{k}, X_1^{k})$ are noise and data pairs obtained by simulating the ODE of the $k$-th rectified flow $v^{k}$. This procedure produces increasingly straight flows that can be simulated with very few steps, ideally one step after several iterations.

-
- image-20250504142749208 -
Illustrations of vector fields after different numbers of reflow iterations. Source: Liu, Gong, and Liu, “Flow Straight and Fast.”
-
-

In practice, distillation-based methods are usually trained in two stages: first train a normal DM, and later distill one-step capabilities into it. This introduces additional computational overhead and complexity.

-

End-to-end

-

Compared to distillation-based methods, end-to-end-based methods train a one-step-capable diffusion model (DM) within a single training run. Various techniques are used to implement such methods. We will focus on two of them: consistency models and shortcut models.

-

Consistency Models

-

In discrete-timestep diffusion models (DMs), three components in the reverse denoising diffusion process are interchangeable through reparameterization: the noise component $\boldsymbol\epsilon$ to remove, the less noisy previous step $X_{t-1}$, and the predicted clean sample $\hat X_0$. This interchangeability is enabled by the following equation (with $\bar\alpha_t$ the cumulative noise schedule):
$$
X_t = \sqrt{\bar\alpha_t}\, \hat X_0 + \sqrt{1-\bar\alpha_t}\, \boldsymbol\epsilon,
$$
which, combined with the posterior $q(X_{t-1} \mid X_t, X_0)$, lets any one of the three be computed from the others. In theory, without altering the fundamental formulation of DMs, the learnable denoiser network can be designed to predict any of these three components. Consistency models (CMs) follow this principle by training the denoiser to specifically predict the clean sample $\hat X_0$. The benefit of this approach is that CMs can naturally scale to perform the reverse process with few steps or even a single step.

-
- image-20250504161430743 -
A consistency model that learns to map any point on the ODE trajectory to the clean sample. Source: Song et al., “Consistency Models.”
-
-

Formally, CMs learn a function $f_\theta(X_t, t)$ that maps noisy data $X_t$ at time $t$ directly to the clean data $X_\epsilon$ (at a small time $\epsilon$ near $0$), satisfying:
$$
f_\theta(X_t, t) = X_\epsilon, \quad \forall t \in [\epsilon, T].
$$
The model must also obey the differential consistency condition:
$$
\frac{\mathrm{d}}{\mathrm{d}t} f_\theta(X_t, t) = 0 .
$$
CMs are trained by minimizing the discrepancy between outputs at adjacent times, with the loss function:
$$
\mathcal{L} = \mathbb{E}\left[ d\big( f_\theta(X_{t_{n+1}}, t_{n+1}),\, f_{\theta^-}(\hat X_{t_n}, t_n) \big) \right],
$$
where $d(\cdot, \cdot)$ is a distance metric and $\theta^-$ is an exponential moving average of $\theta$. Similar to continuous-timestep DMs and discrete-timestep DMs, CMs also have continuous-time and discrete-time variants. Discrete-time CMs are easier to train, but are more sensitive to timestep scheduling and suffer from discretization errors. Continuous-time CMs, on the other hand, suffer from instability during training.
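A simplified sketch of the discrete-time training objective, using squared error as the distance $d$ and a frozen copy of the network as the target $f_{\theta^-}$ (the EMA update, timestep schedule, and boundary parameterization of real CMs are omitted):

```python
import copy
import torch
import torch.nn as nn

f_theta = nn.Sequential(nn.Linear(2 + 1, 64), nn.ReLU(), nn.Linear(64, 2))  # assumed consistency net
f_target = copy.deepcopy(f_theta)    # theta^-; in practice an exponential moving average of theta

def consistency_loss(x0, t_n, t_np1):
    """d( f_theta(x_{t_{n+1}}, t_{n+1}), f_{theta^-}(x_{t_n}, t_n) ) with shared noise."""
    eps = torch.randn_like(x0)
    x_np1 = x0 + t_np1 * eps                     # noisier point on the trajectory
    x_n = x0 + t_n * eps                         # adjacent, less noisy point
    pred = f_theta(torch.cat([x_np1, t_np1], dim=1))
    with torch.no_grad():
        target = f_target(torch.cat([x_n, t_n], dim=1))
    return ((pred - target) ** 2).mean()         # squared error as the distance metric

x0 = torch.randn(16, 2)
t_n = torch.rand(16, 1) * 0.9
print(consistency_loss(x0, t_n, t_n + 0.1).item())
```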

-

For a deeper discussion of the differences between the two variants of CMs, and how to stabilize continuous-time CMs, please refer to Lu and Song, "Simplifying, Stabilizing and Scaling Continuous-Time Consistency Models."

-

Shortcut Models

-

Similar to distillation-based methods, the core idea of shortcut models is inspired by the "curved vector field" problem, but shortcut models take a different approach to solving it.

-

Shortcut models are introduced in Frans et al., "One Step Diffusion via Shortcut Models." The paper presents the insight that conventional DMs' poor performance when jumping with large step sizes stems from their lack of awareness of the step size they are asked to jump forward. Since they are only trained to comply with small step sizes, they only learn the tangents in the curved vector field, not the "correct direction" when a large step size is used.

-

Based on this insight, on top of $X_t$ and $t$, shortcut models additionally include the step size $d$ as part of the condition for the denoiser network $s_\theta(X_t, t, d)$. At small step sizes ($d \to 0$), the model behaves like a standard flow-matching model, learning the expected tangent from noise to data. For larger step sizes, the model learns that one large step should equal two consecutive smaller steps (self-consistency), creating a binary recursive formulation. The model is trained by combining the standard flow matching loss when $d = 0$ and the self-consistency loss when $d > 0$:
$$
\mathcal{L} = \mathbb{E}\left[ \big\| s_\theta(X_t, t, 0) - (X_1 - X_0) \big\|^2 + \big\| s_\theta(X_t, t, 2d) - s_{\text{target}} \big\|^2 \right],
$$
$$
s_{\text{target}} = \frac{1}{2}\big( s_\theta(X_t, t, d) + s_\theta(X'_{t+d},\, t+d,\, d) \big),
$$
$$
X'_{t+d} = X_t + d\, s_\theta(X_t, t, d).
$$

-
- image-20250504180714955 -
Illustration of the training process of shortcut models. Source: Frans et al., “One Step Diffusion via Shortcut Models.”
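A simplified sketch of how the combined loss above can be computed (the network $s_\theta$, toy data, and fixed step size are assumptions; the actual implementation in the paper also handles EMA targets, step-size sampling, and batch splitting):

```python
import torch
import torch.nn as nn

s_net = nn.Sequential(nn.Linear(2 + 2, 64), nn.ReLU(), nn.Linear(64, 2))  # assumed s(x, t, d)

def s(x, t, d):
    return s_net(torch.cat([x, t, d], dim=1))

def shortcut_loss(x0, x1, t, d):
    """Flow-matching loss at d = 0 plus self-consistency loss tying one 2d-step to two d-steps."""
    xt = t * x1 + (1 - t) * x0
    fm_loss = ((s(xt, t, torch.zeros_like(d)) - (x1 - x0)) ** 2).mean()
    with torch.no_grad():                        # two small steps build the target
        first = s(xt, t, d)
        x_mid = xt + d * first                   # follow the first step of size d
        target = 0.5 * (first + s(x_mid, t + d, d))
    sc_loss = ((s(xt, t, 2 * d) - target) ** 2).mean()
    return fm_loss + sc_loss

x0, x1 = torch.randn(16, 2), torch.randn(16, 2)  # noise and toy "clean data"
t, d = torch.rand(16, 1) * 0.5, torch.full((16, 1), 0.125)
print(shortcut_loss(x0, x1, t, d).item())
```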
-
-

Both consistency models and shortcut models can be seamlessly scaled between one-step and multi-step generation to balance quality and efficiency.

-
-

Copyright © 2025. Designed and implemented by Yan Lin.

-
- - - - - \ No newline at end of file diff --git a/dist/blog/index.html b/dist/blog/index.html deleted file mode 100644 index f7395b9..0000000 --- a/dist/blog/index.html +++ /dev/null @@ -1,105 +0,0 @@ - - - - - - - Yan Lin's Blog - - - - - - - - -
- - One Step Diffusion Models - May 2025 -

Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps.

-
- -
- - Multi-modal and Multi-function Transformers - April 2025 -

Multi-modal and multi-function Transformers enable a single architecture to process diverse data types such as language, images, and videos simultaneously. These models employ techniques like vector quantization and lookup-free quantization to map different modalities into a unified embedding space, allowing the Transformer to handle them within the same sequence. Beyond processing multiple data types, these architectures can also combine different functionalities, such as auto-regressive language generation and diffusion-based image creation, within a single model.

-
- -
-
- -
- - - - - - - - - - - \ No newline at end of file diff --git a/dist/blog/md/multi-modal-transformer.assets/1_Dk1X5rmLomXqqTPeuHgBpw.png b/dist/blog/md/multi-modal-transformer.assets/1_Dk1X5rmLomXqqTPeuHgBpw.png deleted file mode 100644 index 9315270..0000000 Binary files a/dist/blog/md/multi-modal-transformer.assets/1_Dk1X5rmLomXqqTPeuHgBpw.png and /dev/null differ diff --git a/dist/blog/md/multi-modal-transformer.assets/Screen_Shot_2020-06-28_at_4.26.40_PM.png b/dist/blog/md/multi-modal-transformer.assets/Screen_Shot_2020-06-28_at_4.26.40_PM.png deleted file mode 100644 index 0e41599..0000000 Binary files a/dist/blog/md/multi-modal-transformer.assets/Screen_Shot_2020-06-28_at_4.26.40_PM.png and /dev/null differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image (1).png b/dist/blog/md/multi-modal-transformer.assets/image (1).png deleted file mode 100644 index 11d9b92..0000000 Binary files a/dist/blog/md/multi-modal-transformer.assets/image (1).png and /dev/null differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image (2).png b/dist/blog/md/multi-modal-transformer.assets/image (2).png deleted file mode 100644 index a7ef23b..0000000 Binary files a/dist/blog/md/multi-modal-transformer.assets/image (2).png and /dev/null differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image (3).png b/dist/blog/md/multi-modal-transformer.assets/image (3).png deleted file mode 100644 index d28e865..0000000 Binary files a/dist/blog/md/multi-modal-transformer.assets/image (3).png and /dev/null differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image (4).png b/dist/blog/md/multi-modal-transformer.assets/image (4).png deleted file mode 100644 index 9e97efb..0000000 Binary files a/dist/blog/md/multi-modal-transformer.assets/image (4).png and /dev/null differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image (5).png b/dist/blog/md/multi-modal-transformer.assets/image (5).png deleted file mode 100644 index e1c27c4..0000000 Binary files a/dist/blog/md/multi-modal-transformer.assets/image (5).png and /dev/null differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image (6).png b/dist/blog/md/multi-modal-transformer.assets/image (6).png deleted file mode 100644 index 47ff387..0000000 Binary files a/dist/blog/md/multi-modal-transformer.assets/image (6).png and /dev/null differ diff --git a/dist/blog/md/multi-modal-transformer.assets/image.png b/dist/blog/md/multi-modal-transformer.assets/image.png deleted file mode 100644 index 7db5437..0000000 Binary files a/dist/blog/md/multi-modal-transformer.assets/image.png and /dev/null differ diff --git a/dist/blog/md/multi-modal-transformer.md b/dist/blog/md/multi-modal-transformer.md deleted file mode 100644 index 5f9af3a..0000000 --- a/dist/blog/md/multi-modal-transformer.md +++ /dev/null @@ -1,148 +0,0 @@ -# Multi-modal and Multi-function Transformers - -Transformers have gained immense popularity within deep learning and AI communities in recent years. Since their introduction in *Vaswani et al., "Attention Is All You Need"*, they have proven to be powerful sequential models across diverse domains, with thousands of variations and "improved versions." The rise of Large Language Models (LLMs), which largely use Transformers as their foundation, has led to another surge in research around this architecture. This trend has even led graph learning and Computer Vision (CV) communities to move beyond their established foundation models (i.e., GNNs and CNNs) and embrace Transformers. 
This explains the increasing prevalence of graph Transformers and image Transformers today. - -> Han et al., “A Survey on Vision Transformer”; Khan et al., “Transformers in Vision”; Yun et al., “Graph Transformer Networks.” - -Beyond "chasing the trend," using Transformer as a unified foundation model offers several advantages: - -- Transformers excel at capturing long-term dependencies. Unlike GNNs and CNNs which require deeper network structures for longer context, Transformers natively support global dependency modeling through their self-attention mechanism. They also avoid global smoothing and vanishing gradient problems that hinder context length scaling in other network architectures. -- Transformers process sequences in parallel rather than sequentially, enabling full utilization of GPU acceleration. This advantage can be further enhanced with techniques like those described in *Dao et al., "FlashAttention."* -- Transformers are flexible network structures. They don't inherently enforce sequentiality—without positional encoding, the ordering of input steps to Transformers is equivalent. Through strategic permutation and positional encoding, Transformers can adapt to a wide range of structured and unstructured data. -- The development of LLMs has made many open-weight Transformer models available with strong natural language understanding capabilities. These Transformers can be prompted and fine-tuned to model other modalities such as spatiotemporal data and images while retaining their language modeling abilities, creating opportunities for developing multi-modal foundation models. -- From a practical perspective, using Transformer as a foundation allows reuse of technical infrastructure and optimizations developed over years, including efficient architecture designs, training pipelines, and specialized hardware. - -In this article, we will briefly explore techniques for unifying multiple modalities (e.g., natural language and images) and multiple functionalities (e.g., language models and diffusion denoisers) within a single Transformer. These techniques are largely sourced from recent oral papers presented at ICML, ICLR, and CVPR conferences. I assume readers have general knowledge of basic concepts in ML and neural networks, Transformers, LLMs, and diffusion models. - -Since images and language modalities represent continuous and discrete data respectively, we will use them as examples throughout this article. Keep in mind that the techniques introduced can be readily extended to other modalities, including spatiotemporal data. - -# General Goal - -The goal of a multi-modal Transformer is to create a model that can accept multi-modal inputs and produce multi-modal outputs. For example, instead of using a CNN-based image encoder and a Transformer-based language encoder to map image and language modalities to the latent space separately, a multi-modal Transformer would be able to process the combination of image and language (sentence) as a single sequence. - -![image](multi-modal-transformer.assets/image.png) - -> An example of “conventional” multi-modal fusion. Different modality is processed by separate models and fused at some point. Source: *Xiang, Hao, Runsheng Xu, and Jiaqi Ma. "HM-ViT: Hetero-modal vehicle-to-vehicle cooperative perception with vision transformer." CVPR, 2023.* - -![image (1)](multi-modal-transformer.assets/image (1).png) - -> An example of a Transformer that can handle multi-modal inputs and outputs. 
Different modalities are all projected into tokens and subsequently processed by a unified Transformer encoder. Source: *Kondratyuk, Dan, Lijun Yu, et al. “VideoPoet: A Large Language Model for Zero-Shot Video Generation,” ICML, 2024.* - -Beyond multi-modal processing, a multi-function Transformer can, for example, function as both a language model (auto-regressive generation) and diffusion denoiser (score-matching generation) simultaneously, supporting two of the most common generation schemes used today. - -# Modality Embedding - -A fundamental challenge in unifying multiple modalities within a single Transformer is how to represent different modalities in the same embedding space. For the "QKV" self-attention mechanism to work properly, each item in the input sequence must be represented by an embedding vector of the same dimension, matching the "model dimension" of the Transformer. - -![image (2)](multi-modal-transformer.assets/image (2).png) - -> Illustration of the QKV self-attention mechanism in Transformer. [Source](https://en.wikipedia.org/wiki/Attention_(machine_learning)) - -The most common method for mapping language into the embedding space is through tokenization and token embedding. A tokenizer maps a word or word fragment into a discrete token index, and an index-fetching embedding layer (implemented in frameworks like PyTorch with `nn.Embedding`) maps this index into a fixed-dimension embedding vector. In principle, all discrete features can be mapped into the embedding space using this approach. - -![1_Dk1X5rmLomXqqTPeuHgBpw](multi-modal-transformer.assets/1_Dk1X5rmLomXqqTPeuHgBpw.png) - -> Visualization of tokenizer and index-fetching embedding layer. [Source](https://medium.com/@hunter-j-phillips/the-embedding-layer-27d9c980d124) - -## Vector Quantization - -For continuous features, one intuitive approach is to first tokenize them into discrete tokens, thereby unifying the embedding process across both discrete and continuous features. **Vector quantization**, introduced in VQ-VAE, is one of the most common methods for this purpose. - -> Van Den Oord, Aaron, and Oriol Vinyals. "Neural discrete representation learning." NeurIPS, 2017. - -Vector quantization maintains a "codebook" $\boldsymbol C \in \mathbb R^{n\times d}$, which functions similarly to the index-fetching embedding layer, where $n$ is the total number of unique tokens, and $d$ is the embedding size. A given continuous vector $\boldsymbol{z}\in\mathbb R^{d}$ is quantized into a discrete value $i\in\mathbb [0,n-1]$ by finding the closest row vector in $\boldsymbol C$ to $\boldsymbol{z}$, and that row vector $\boldsymbol C_i$ is fetched as the embedding for $\boldsymbol{z}$. Formally: -$$ -i = \arg\min_j ||\boldsymbol z - \boldsymbol C_j||₂ -$$ -![Screen_Shot_2020-06-28_at_4.26.40_PM](multi-modal-transformer.assets/Screen_Shot_2020-06-28_at_4.26.40_PM.png) - -## Lookup-Free Quantization - -A significant limitation of vector quantization is that it requires calculating distances between the given continuous vectors and the entire codebook, which becomes computationally expensive for large-scale codebooks. This creates tension with the need for expanded codebooks to represent complex modalities such as images and videos. Research has shown that simply increasing the number of unique tokens doesn't always improve codebook performance. - -> “A simple trick for training a larger codebook involves decreasing the code embedding dimension when increasing the vocabulary size.” Source: *Yu, Lijun, Jose Lezama, et al. 
“Language Model Beats Diffusion - Tokenizer Is Key to Visual Generation,” ICLR, 2024.* - -Building on this insight, **Lookup-Free Quantization** (LFQ) eliminates the embedding dimension of codebooks (essentially reducing the embedding dimension to 0) and directly calculates the discrete index $i$ by individually quantizing each dimension of $\boldsymbol z$ into a binary digit. The index $i$ can then be computed by converting the binary representation to decimal. Formally: -$$ -i=\sum_{j=1}^{d} 2^{(j-1)}\cdot 𝟙(z_j > 0) -$$ - -> For example, given a continuous vector $\boldsymbol z=\langle -0.52, 1.50, 0.53, -1.32\rangle$, we first quantize each dimension into $\langle 0, 1, 1, 0\rangle$, based on the sign of each dimension. The token index of $\boldsymbol z$ is simply the decimal equivalent of the binary 0110, which is 6. - -However, this approach introduces another challenge: we still need an index-fetching embedding layer to map these token indices into embedding vectors for the Transformer. This, combined with the typically large number of unique tokens when using LFQ—a 32-dimensional $\boldsymbol z$ will result in $2^{32}=4,294,967,296$ unique tokens—creates significant efficiency problems. One solution is to factorize the token space. Effectively, this means splitting the binary digits into multiple parts, embedding each part separately, and concatenating the resulting embedding vectors. For example, with a 32-dimensional $\boldsymbol z$, if we quantize and embed its first and last 16 dimensions separately, we “only” need to handle $2^{16}*2= 131,072$ unique tokens. - -Note that this section doesn't extensively explain how to map raw continuous features into the vector $\boldsymbol{z}$, as these techniques are relatively straightforward and depend on the specific feature type—for example, fully-connected layers for numerical features, or CNN/GNN with feature flattening for structured data. - -## Quantization over Linear Projection - -You might be asking—why can't we simply use linear projections to map the raw continuous features into the embedding space? What are the benefits of quantizing continuous features into discrete tokens? - -Although Transformers are regarded as universal sequential models, they were designed for discrete tokens in their first introduction in *Vaswani et al., "Attention Is All You Need"*. Empirically, they have optimal performance when dealing with tokens, compared to continuous features. This is supported by many research papers claiming that quantizing continuous features improves the performance of Transformers, and works demonstrating Transformers' subpar performance when applied directly to continuous features. - -> Mao, Chengzhi, Lu Jiang, Mostafa Dehghani, Carl Vondrick, Rahul Sukthankar, and Irfan Essa. “Discrete Representations Strengthen Vision Transformer Robustness,” ICLR, 2022. - -> Ilbert, Romain, Ambroise Odonnat, et al. “SAMformer: Unlocking the Potential of Transformers in Time Series Forecasting with Sharpness-Aware Minimization and Channel-Wise Attention,” ICML, 2024. - -On the other hand, unifying different modalities into tokens is especially beneficial in the context of Transformer-based "foundation models," since it preserves the auto-regressive next-token prediction architecture of LLMs. Combined with special tokens such as "start of sentence" and "end of sentence," the Transformer model is flexible in generating contents of mixed modalities with varied length. 
- -> For example, by quantizing videos into discrete tokens and combining the token space of videos and language, one can create a unified Transformer model that generates both videos and language in one sequence. The start and end points of video and language sub-sequences are fully determined by the model, based on the specific input prompt. This structure would be difficult to replicate if we used tokenization for language but linear projection for videos. - -# Transformer Backbone - -After different modalities are mapped into the same embedding space, they can be arranged into a sequence of embedding vectors and input into a Transformer backbone. We don't discuss the variations of Transformer structure and improvement techniques here, as they are numerous, and ultimately function similarly as sequential models. - -> Lan et al., “ALBERT”; Ye et al., “Differential Transformer”; Kitaev, Kaiser, and Levskaya, “Reformer”; Su et al., “RoFormer”; Dai et al., “Transformer-XL.” - -As we know, the "full" Transformer structure proposed in *Vaswani et al., "Attention Is All You Need"* includes an encoder and a decoder. They perform self-attention within their respective input sequences, and the decoder additionally performs cross-attention between its input sequence and the memory sequence derived from the encoder's output. Some early language models use encoder-only structure (like *Devlin et al., "BERT"*) focused on outputting embedding vectors or encoder-decoder structure (like *Chung et al., "Scaling Instruction-Finetuned Language Models"*) for generating natural language output. Most modern large language models and foundation models use decoder-only structure (like *Brown et al., "Language Models Are Few-Shot Learners"*), focusing on auto-regressive generation of language output. - -The encoder-only structure theoretically excels at representation learning, and its produced embedding vectors can be applied to various downstream tasks. Recent developments have gradually moved towards decoder-only structure, centered around the idea of building models that are capable of directly generating the required final output of every downstream task. - -> For example, to perform sentiment analysis, BERT will compute an embedding vector for the query sentence, and the embedding vector can be used in a dedicated classifier to predict the sentiment label. GPT, on the other hand, can directly answer the question "what is the sentiment associated with the query sentence?" Comparatively, GPT is more versatile in most cases and can easily perform zero-shot prediction. - -Nevertheless, representation learning is still a relevant topic. The general understanding is that decoder-only structure cannot perform conventional representation learning, for example mapping a sentence into a fixed-dimension embedding vector. Yet, there are a few works in the latest ICLR that shed light on the utilization of LLMs as representation learning or embedding models: - -> Gao, Leo, Tom Dupre la Tour, Henk Tillman, Gabriel Goh, Rajan Troll, Alec Radford, Ilya Sutskever, Jan Leike, and Jeffrey Wu. “Scaling and Evaluating Sparse Autoencoders,” 2024. [Link](https://openreview.net/forum?id=tcsZt9ZNKD) - -> Li, Ziyue, and Tianyi Zhou. “Your Mixture-of-Experts LLM Is Secretly an Embedding Model for Free,” 2024. [Link](https://openreview.net/forum?id=eFGQ97z5Cd) - -> Zhang, Jie, Dongrui Liu, Chen Qian, Linfeng Zhang, Yong Liu, Yu Qiao, and Jing Shao. “REEF: Representation Encoding Fingerprints for Large Language Models,” 2024. 
[Link](https://openreview.net/forum?id=SnDmPkOJ0T) - -# Output Layer - -For language generation, Transformers typically use classifier output layers, mapping the latent vector of each item in the output sequence back to tokens. As we've established in the "modality embedding" section, the optimal method to embed continuous features is to quantize them into discrete tokens. Correspondingly, an intuitive method to output continuous features is to map these discrete tokens back to the continuous feature space, essentially reversing the vector quantization process. - -## Reverse Vector Quantization - -One approach to reverse vector quantization is readily available in VQ-VAE, since it is an auto-encoder. Given a token $i$, we can look up its embedding in the codebook as $\boldsymbol C_i$, then apply a decoder network to map $\boldsymbol C_i$ back to the continuous feature vector $\boldsymbol z$. The decoder network can be pre-trained in the VQ-VAE framework—pre-train the VQ-VAE tokenizer, encoder, and decoder using auto-encoding loss functions, or end-to-end trained along with the whole Transformer. In the NLP and CV communities, the pre-training approach is more popular, since there are many large-scale pre-trained auto-encoders available. - -![image (4)](multi-modal-transformer.assets/image (4).png) - -> The encoder-decoder structure of MAGVIT (*Yu et al., “MAGVIT”*), a visual VQ-VAE model. A 3D-VQ encoder quantizes a video into discrete tokens, and a 3D-VQ decoder maps them back to the pixel space. - -## Efficiency Enhancement - -For continuous feature generation, unlike language generation where the output tokens are the final output, we are essentially representing the final output with a limited size token space. Thus, for complicated continuous features like images and videos, we have to expand the token space or use more tokens to represent one image or one video frame to improve generation quality, which can result in efficiency challenges. - -There are several workarounds to improve the efficiency of multi-modal outputs. One approach is to generate low-resolution outputs first, then use a separate super-resolution module to improve the quality of the output. This approach is explored in *Kondratyuk et al., "VideoPoet"* and *Tian et al., "Visual Autoregressive Modeling"*. Interestingly, the overall idea is very similar to nVidia's DLSS, where the graphics card renders a low-resolution frame (e.g., 1080p) using the conventional rasterization pipeline, then a super resolution model increases the frame's resolution (e.g., 4k) utilizing the graphics card's tensor hardware, improving games' overall frame rate. - -Another workaround follows the idea of compression. Take video generation as an example. The model generates full features for key frames, and light-weight features for motion vectors that describe subtle differences from those key frames. This is essentially how inter-frame compressed video codecs work, which takes advantage of temporal redundancy between neighboring frames. - -![image (5)](multi-modal-transformer.assets/image (5).png) - -> Keys frames and motion vectors used in *Jin et al., “Video-LaVIT.”* - -# Fuse with Diffusion Models - -Despite continuous efforts to enable representation and generation of images and videos with a language model structure (auto-regressive), current research indicates that diffusion models (more broadly speaking, score-matching generative models) outperform language models on continuous feature generation. 
-
-An intriguing question arises: why not integrate the structures of language models and diffusion models into one Transformer to get the best of both worlds? *Zhou et al., "Transfusion"* explored this idea. The approach is straightforward: build a Transformer that can handle both language and image inputs and outputs. The language component functions as a language model, while the image component serves as a denoiser network for diffusion models. The model is trained by combining the language modeling loss and the DDPM loss, enabling it to function either as a language model or as a text-to-image denoiser.
-
-![image (6)](multi-modal-transformer.assets/image (6).png)
-
-> A Transformer capable of functioning as a language model and a diffusion denoiser at the same time. Source: *Zhou, Chunting, Lili Yu, et al. “Transfusion: Predict the Next Token and Diffuse Images with One Multi-Modal Model,” ICLR, 2025.*
-
-# Conclusion
-
-The evolution of Transformers into versatile foundation models capable of handling multiple modalities and functionalities represents a significant advancement in AI research. By enabling a single architecture to process diverse data types through techniques like vector quantization and lookup-free quantization, researchers have created models that can seamlessly integrate language, images, and other modalities within the same embedding space.
-
-In our research domain, we encounter even more diverse and domain-specific multi-modal data, such as traffic flows, trajectories, and real-world agent interactions. A unified Transformer for such data presents a promising solution for creating "foundation models" that generalize across diverse tasks and scenarios. However, domain-specific challenges, including data encoding and decoding, computational efficiency, and scalability, must be addressed to realize this potential.
\ No newline at end of file diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250503125941212.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250503125941212.png deleted file mode 100644 index 5b261ec..0000000 Binary files a/dist/blog/md/one-step-diffusion-models.assets/image-20250503125941212.png and /dev/null differ diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250503132738122.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250503132738122.png deleted file mode 100644 index c3c493c..0000000 Binary files a/dist/blog/md/one-step-diffusion-models.assets/image-20250503132738122.png and /dev/null differ diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250503135351246.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250503135351246.png deleted file mode 100644 index f8dd53f..0000000 Binary files a/dist/blog/md/one-step-diffusion-models.assets/image-20250503135351246.png and /dev/null differ diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250503141422791.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250503141422791.png deleted file mode 100644 index da6aedf..0000000 Binary files a/dist/blog/md/one-step-diffusion-models.assets/image-20250503141422791.png and /dev/null differ diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250504142749208.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250504142749208.png deleted file mode 100644 index 2d3efe8..0000000 Binary files a/dist/blog/md/one-step-diffusion-models.assets/image-20250504142749208.png and /dev/null differ diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250504161430743.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250504161430743.png deleted file mode 100644 index 8c5b463..0000000 Binary files a/dist/blog/md/one-step-diffusion-models.assets/image-20250504161430743.png and /dev/null differ diff --git a/dist/blog/md/one-step-diffusion-models.assets/image-20250504180714955.png b/dist/blog/md/one-step-diffusion-models.assets/image-20250504180714955.png deleted file mode 100644 index 55b3d0c..0000000 Binary files a/dist/blog/md/one-step-diffusion-models.assets/image-20250504180714955.png and /dev/null differ diff --git a/dist/blog/md/one-step-diffusion-models.md b/dist/blog/md/one-step-diffusion-models.md deleted file mode 100644 index 42eebd2..0000000 --- a/dist/blog/md/one-step-diffusion-models.md +++ /dev/null @@ -1,137 +0,0 @@ -# One Step Diffusion Models - -Despite the promising performance of diffusion models on continuous modality generation, one deficiency that is holding them back is their requirement for multi-step denoising processes, which can be computationally expensive. In this article, we examine recent works that aim to build diffusion models capable of performing sampling in one or a few steps. - ---- - -# Background - -Diffusion models (DMs), or more broadly speaking, score-matching generative models, have become the de facto framework for building deep generation models. They demonstrate exceptional generation performance, especially on continuous modalities including images, videos, audios, and spatiotemporal data. - -Most diffusion models work by coupling a forward diffusion process and a reverse denoising diffusion process. The forward diffusion process gradually adds noise to the ground truth clean data $X_0$, until noisy data $X_T$ that follows a relatively simple distribution is reached. 
The reverse denoising diffusion process starts from the noisy data $X_T$ and removes the noise component step by step until clean generated data $X_0$ is reached. The reverse process is an inherently sequential sampling process, meaning the steps within a single generation cannot be parallelized, which can be inefficient when a large number of steps is required.
-
-![image-20250503125941212](one-step-diffusion-models.assets/image-20250503125941212.png)
-
-> The two processes in a typical diffusion model. *Source: Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”*
-
-## Understanding DMs
-
-There are many ways to understand how diffusion models (DMs) work. One of the most common and intuitive is that a DM learns an ordinary differential equation (ODE) that transforms noise into data. Imagine an ODE vector field between the noise $X_T$ and the clean data $X_0$. By training on a sufficiently large number of timesteps $t\in [0,T]$, a DM learns the vector (tangent) pointing towards the cleaner data $X_{t-\Delta t}$, given any specific timestep $t$ and the corresponding noisy data $X_t$. This idea is easy to illustrate in a simplified 1-dimensional data scenario.
-
-![image-20250503132738122](one-step-diffusion-models.assets/image-20250503132738122.png)
-
-> Illustrated ODE flow of a diffusion model on 1-dimensional data. *Source: Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”* It should be noted that, as the figure suggests, there are differences between ODEs and DMs in a narrow sense. Flow matching models, a variant of DMs, more closely resemble ODEs.
-
-## DMs Scale Poorly with Few Steps
-
-Vanilla DDPM, which is essentially a discrete-timestep DM, can only perform the reverse process using the same number of steps it is trained on, typically thousands. DDIM introduces a reparameterization scheme that enables skipping steps during the reverse process of DDPM. Continuous-timestep DMs, such as those based on stochastic differential equations (SDEs), naturally allow the reverse process to use fewer steps than the forward process used during training.
-
-> Ho, Jain, and Abbeel, “Denoising Diffusion Probabilistic Models.”
-> Song, Meng, and Ermon, “Denoising Diffusion Implicit Models.”
-> Song et al., “Score-Based Generative Modeling through Stochastic Differential Equations.”
-
-Nevertheless, their performance typically degrades catastrophically when the number of reverse process steps is reduced to single digits.
-
-![image-20250503135351246](one-step-diffusion-models.assets/image-20250503135351246.png)
-
-> Images generated by conventional DMs with only a few reverse process steps. *Source: Frans et al., “One Step Diffusion via Shortcut Models.”*
-
-To understand why DMs scale poorly with few reverse process steps, we can return to the ODE vector field perspective. When the target data distribution is complex, the vector field typically contains numerous intersections. When a given $X_t$ and $t$ lies at one of these intersections, the learned vector points to the average direction of all candidates. This causes the generated data to approach the mean of the training data when only a few reverse process steps are used. Another explanation is that the learned vector field is highly curved, and using only a few reverse process steps means attempting to approximate these curves with polylines, which is inherently difficult.
-
-![image-20250503141422791](one-step-diffusion-models.assets/image-20250503141422791.png)
-
-> Illustration of why DMs scale poorly with few reverse process steps. *Source: Frans et al., “One Step Diffusion via Shortcut Models.”*
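-
-The polyline argument can be made concrete with a toy ODE that has nothing to do with a trained DM: a hand-picked curved (rotational) vector field integrated with Euler steps. The field, step counts, and time horizon below are made up purely for illustration; the point is only that a single large step badly misses a curved trajectory, while finer polylines approach it.
-
-```python
-import numpy as np
-
-def velocity(x):
-    """A hand-picked curved vector field (pure rotation); a stand-in for a
-    learned probability-flow ODE, not an actual diffusion model."""
-    return np.array([-x[1], x[0]])
-
-def euler_integrate(x0, t_total, n_steps):
-    """Approximate the ODE trajectory with a polyline of n_steps segments."""
-    x, dt = np.array(x0, dtype=float), t_total / n_steps
-    for _ in range(n_steps):
-        x = x + dt * velocity(x)
-    return x
-
-start, t_total = [1.0, 0.0], np.pi / 2
-exact = np.array([0.0, 1.0])  # rotating (1, 0) by 90 degrees
-
-for n in (1, 4, 32, 256):
-    error = np.linalg.norm(euler_integrate(start, t_total, n) - exact)
-    print(f"{n:3d} steps -> endpoint error {error:.3f}")
-# The error shrinks as the polyline gets finer; the single-step jump
-# overshoots the curve, mirroring why naive few-step sampling fails.
-```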
-
-We will introduce two branches of methods that aim to scale DMs down to a few or even a single reverse process step: **distillation-based** methods, which distill a pre-trained DM into a one-step model, and **end-to-end** methods, which train a one-step-capable DM from scratch.
-
-# Distillation
-
-Distillation-based methods are also called **rectified flow** methods. Their idea follows the "curved ODE vector field" insight above: if the curved vectors (flows) are what hinders scaling down the number of reverse process steps, can we straighten these vectors so that they are easy to approximate with polylines or even straight lines?
-
-*Liu, Gong, and Liu, "Flow Straight and Fast"* implements this idea, focusing on learning an ODE that follows straight vectors as much as possible. In the context of continuous-time DMs where $T=1$ and $t\in[0,1]$, suppose the clean data $X_0$ and the noise $X_1$ each follow a distribution, $X_0\sim \pi_0$ and $X_1\sim \pi_1$. The "straight vectors" can be obtained by solving a nonlinear least squares optimization problem:
-$$
-\min_{v} \int_{0}^{1} \mathbb{E}\left[\left\|\left(X_{1}-X_{0}\right)-v\left(X_{t}, t\right)\right\|^{2}\right] \mathrm{d} t,
-\quad X_{t}=t X_{1}+(1-t) X_{0},
-$$
-where $v$ is the vector field of the ODE $dZ_t = v(Z_t,t)dt$.
-
-Though straightforward, when the clean data distribution $\pi_0$ is very complicated, the ideal result of completely straight vectors can be hard to achieve. To address this, a "reflow" procedure is introduced, which iteratively trains new rectified flows using data generated by previously obtained flows:
-$$
-Z^{(k+1)} = \text{RectFlow}\big(Z_0^{(k)}, Z_1^{(k)}\big).
-$$
-This procedure produces increasingly straight flows that can be simulated with very few steps, ideally a single step after several iterations.
-
-![image-20250504142749208](one-step-diffusion-models.assets/image-20250504142749208.png)
-
-> Illustration of the vector fields after different numbers of reflow iterations. *Source: Liu, Gong, and Liu, “Flow Straight and Fast.”*
-
-In practice, distillation-based methods are usually trained in two stages: first train a normal DM, then distill one-step capability into it. This introduces additional computational overhead and complexity.
-
-# End-to-end
-
-Compared to distillation-based methods, end-to-end methods train a one-step-capable diffusion model within a single training run. Various techniques are used to implement such methods; we will focus on two of them: **consistency models** and **shortcut models**.
-
-## Consistency Models
-
-In discrete-timestep DMs, three components of the reverse denoising diffusion process are interchangeable through reparameterization: the noise component $\epsilon_t$ to remove, the less noisy previous step $x_{t-1}$, and the predicted clean sample $x_0$. This interchangeability is enabled by the following equation:
-$$
-x_t = \sqrt{\bar{\alpha}_t} \, x_0 + \sqrt{1 - \bar{\alpha}_t} \, \epsilon_t
-$$
-In theory, without altering the fundamental formulation of DMs, the learnable denoiser network can be designed to predict any of these three components. Consistency models (CMs) follow this principle by training the denoiser to specifically predict the clean sample $x_0$. The benefit of this approach is that CMs can naturally scale to perform the reverse process with few steps or even a single step.
-
-![image-20250504161430743](one-step-diffusion-models.assets/image-20250504161430743.png)
-
-> A consistency model that learns to map any point on the ODE trajectory to the clean sample. *Source: Song et al., “Consistency Models.”*
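-
-As a small numerical sanity check of the reparameterization above (not the training code of any specific CM), the sketch below shows how a predicted clean sample implies a predicted noise component given $x_t$ and $\bar{\alpha}_t$. The schedule value and tensors are made-up stand-ins.
-
-```python
-import torch
-
-def eps_from_x0(x_t: torch.Tensor, alpha_bar_t: float, x0_pred: torch.Tensor):
-    """Invert x_t = sqrt(a)*x0 + sqrt(1-a)*eps for the noise component,
-    so a network that predicts x0 implicitly also predicts eps."""
-    return (x_t - alpha_bar_t ** 0.5 * x0_pred) / (1.0 - alpha_bar_t) ** 0.5
-
-# Made-up schedule value and random tensors standing in for real data.
-alpha_bar_t = 0.3
-x0 = torch.randn(4, 8)
-eps = torch.randn(4, 8)
-x_t = alpha_bar_t ** 0.5 * x0 + (1 - alpha_bar_t) ** 0.5 * eps
-
-# When the "prediction" equals the true x0, the implied noise matches eps.
-assert torch.allclose(eps_from_x0(x_t, alpha_bar_t, x0), eps, atol=1e-5)
-```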
-
-Formally, CMs learn a function $f_\theta(x_t,t)$ that maps noisy data $x_t$ at time $t$ directly to the clean data $x_0$, satisfying, for any $t, t'$ along the same ODE trajectory:
-$$
-f_\theta(x_t, t) = f_\theta(x_{t'}, t')
-$$
-The model must also obey the differential consistency condition:
-$$
-\frac{d}{dt} f_\theta(x_t, t) = 0
-$$
-CMs are trained by minimizing the discrepancy between outputs at adjacent times, with the loss function:
-$$
-\mathcal{L} = \mathbb{E} \left[ d\left(f_\theta(x_t, t), f_\theta(x_{t'}, t')\right) \right]
-$$
-Like DMs, CMs have both continuous-time and discrete-time variants. Discrete-time CMs are easier to train, but are more sensitive to timestep scheduling and suffer from discretization errors. Continuous-time CMs, on the other hand, suffer from instability during training.
-
-For a deeper discussion of the differences between the two variants of CMs, and of how to stabilize continuous-time CMs, please refer to *Lu and Song, “Simplifying, Stabilizing and Scaling Continuous-Time Consistency Models.”*
-
-## Shortcut Models
-
-Similar to distillation-based methods, the core idea of shortcut models is inspired by the "curved vector field" problem, but shortcut models take a different approach to solving it.
-
-Shortcut models are introduced in *Frans et al., "One Step Diffusion via Shortcut Models."* The paper presents the insight that conventional DMs perform badly when jumping with large step sizes because they are unaware of the step size they are about to take. Since they are only trained to comply with small step sizes, they only learn the tangents of the curved vector field, not the "correct direction" to follow when a large step size is used.
-
-Based on this insight, on top of $x_t$ and $t$, shortcut models additionally include the step size $d$ as part of the condition for the denoiser network. At small step sizes ($d\rightarrow 0$), the model behaves like a standard flow-matching model, learning the expected tangent from noise to data. For larger step sizes, the model learns that one large step should equal two consecutive smaller steps (self-consistency), creating a binary recursive formulation. The model is trained by combining the standard flow matching loss when $d=0$ with the self-consistency loss when $d>0$:
-$$
-\mathcal{L} = \mathbb{E} \Big[ \underbrace{\| s_\theta(x_t, t, 0) - (x_1 - x_0)\|^2}_{\text{Flow-Matching}} + \underbrace{\|s_\theta(x_t, t, 2d) - \mathbf{s}_{\text{target}}\|^2}_{\text{Self-Consistency}} \Big],
-$$
-$$
-\mathbf{s}_{\text{target}} = \frac{s_\theta(x_t, t, d) + s_\theta(x'_{t+d},\, t + d,\, d)}{2}, \qquad x'_{t+d} = x_t + s_\theta(x_t, t, d)\, d.
-$$
-
-![image-20250504180714955](one-step-diffusion-models.assets/image-20250504180714955.png)
-
-> Illustration of the training process of shortcut models. *Source: Frans et al., “One Step Diffusion via Shortcut Models.”*
-
-Both consistency models and shortcut models can be seamlessly scaled between one-step and multi-step generation to balance quality and efficiency.
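-
-To make the step-size conditioning concrete, below is a minimal sampling-loop sketch for a hypothetical step-size-conditioned velocity model `shortcut_model(x, t, d)`. It is not the reference implementation from the paper; it assumes the convention of the shortcut equations above, with integration running from noise at $t=0$ to data at $t=1$. The same trained weights serve any number of steps, which is exactly the quality-efficiency trade-off described here.
-
-```python
-import torch
-
-@torch.no_grad()
-def sample(shortcut_model, x_noise: torch.Tensor, n_steps: int) -> torch.Tensor:
-    """Few-step generation with a step-size-conditioned velocity model.
-
-    shortcut_model(x, t, d) returns the velocity for a jump of size d taken
-    at time t. Assumed convention: t=0 is noise, t=1 is data.
-    """
-    x = x_noise
-    d = 1.0 / n_steps
-    for i in range(n_steps):
-        t = i * d
-        x = x + d * shortcut_model(x, t, d)  # one polyline segment of length d
-    return x
-
-# Toy stand-in; a real shortcut model would be a trained network.
-toy_model = lambda x, t, d: -x
-one_step = sample(toy_model, torch.randn(4, 3, 32, 32), n_steps=1)
-multi_step = sample(toy_model, torch.randn(4, 3, 32, 32), n_steps=8)
-```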
diff --git a/dist/blog/template.html b/dist/blog/template.html deleted file mode 100644 index 9c57d8d..0000000 --- a/dist/blog/template.html +++ /dev/null @@ -1,131 +0,0 @@ - - - - - - - Yan Lin's Blog - {{ title }} - - - - - - - - - - -
-
-
-
- -
-
- -
-
- -
-
-
-
- -
-
- {{ content }} -
-

Copyright © 2025. Designed and implemented by Yan Lin.

-
- - - - - \ No newline at end of file diff --git a/dist/index.html b/dist/index.html index 5c3b430..ca83685 100644 --- a/dist/index.html +++ b/dist/index.html @@ -36,60 +36,95 @@ - - --> @@ -178,7 +195,7 @@ | Designed and implemented by Yan Lin. | - Source Code + Source Code

diff --git a/dist/publications/index.html b/dist/publications/index.html index faac88a..4c62930 100644 --- a/dist/publications/index.html +++ b/dist/publications/index.html @@ -36,24 +36,41 @@ + + --> @@ -70,6 +87,8 @@

+ Paper + Preprint Code @@ -464,7 +483,7 @@ | Designed and implemented by Yan Lin. | - Source Code + Source Code

diff --git a/generate.py b/generate.py index dd166ec..7cc5100 100644 --- a/generate.py +++ b/generate.py @@ -6,28 +6,25 @@ from jinja2 import Environment, FileSystemLoader if __name__ == '__main__': with open('data.yaml', 'r') as file: profile_data = yaml.safe_load(file) - + env = Environment(loader=FileSystemLoader('templates')) - + os.makedirs('dist', exist_ok=True) os.makedirs('dist/publications', exist_ok=True) os.makedirs('dist/projects', exist_ok=True) os.makedirs('dist/presentations', exist_ok=True) - os.makedirs('dist/blog', exist_ok=True) - os.makedirs('dist/blog/html', exist_ok=True) - + def render_template(template_name, output_path, **kwargs): template = env.get_template(template_name) html = template.render(**kwargs) - + with open(output_path, 'w') as file: file.write(html) - + print(f'Generated {output_path}') - + render_template('index.html', 'dist/index.html', data=profile_data, is_home_page=True) render_template('publications.html', 'dist/publications/index.html', data=profile_data, is_home_page=False) render_template('projects.html', 'dist/projects/index.html', data=profile_data, is_home_page=False) render_template('presentations.html', 'dist/presentations/index.html', data=profile_data, is_home_page=False) - render_template('blog.html', 'dist/blog/index.html', data=profile_data, is_home_page=False) - print('Static site generation complete!') + print('Static site generation complete!') diff --git a/parser/md.py b/parser/md.py deleted file mode 100644 index a759ca2..0000000 --- a/parser/md.py +++ /dev/null @@ -1,168 +0,0 @@ -import markdown -import re -import os -import glob -from typing import List - - -def markdown_to_html_paragraphs(markdown_text: str) -> List[str]: - """ - Convert markdown text into a list of HTML paragraphs. - Supports mathematical equations using LaTeX syntax. - - Args: - markdown_text (str): The markdown text to convert - - Returns: - List[str]: A list of HTML paragraphs, each wrapped in

tags - """ - # Prepend "md/" to image paths if they don't already start with md/ - markdown_text = re.sub(r'!\[(.*?)\]\((?!md/)([^/].*?\.assets/.*?)\)', r'![\1](/blog/md/\2)', markdown_text) - - # Check if the first line starts with a # for h1 title - lines = markdown_text.split('\n') - has_h1_title = False - bold_title = None - - if lines and lines[0].strip().startswith('#'): - has_h1_title = True - title_line = lines[0].strip().lstrip('#').strip() - bold_title = f'

{title_line}

' - # Remove the title from the markdown to avoid duplicate processing - markdown_text = '\n'.join(lines[1:]) - else: - raise ValueError("No title found in the markdown file") - - # Configure markdown with math extensions - extensions = [ - 'markdown.extensions.extra', # For blockquotes and other features - 'markdown.extensions.fenced_code', # For code blocks - 'markdown.extensions.codehilite', # For syntax highlighting - 'markdown.extensions.attr_list', # For attributes - 'markdown.extensions.md_in_html', # For markdown inside HTML - 'mdx_math', # For math support - ] - - try: - # Try to use python-markdown-math which outputs compatible with MathJax 3 - import pymdownx.arithmatex - extensions.remove('mdx_math') - extensions.append('pymdownx.arithmatex') - extension_configs = { - 'pymdownx.arithmatex': { - 'generic': True # Uses \(...\) for inline and \[...\] for display math - } - } - except ImportError: - # Fallback to mdx_math - extension_configs = { - 'mdx_math': { - 'enable_dollar_delimiter': True, # Enable $...$ for inline math - } - } - - # Convert markdown to HTML with math support - html = markdown.markdown( - markdown_text, - extensions=extensions, - extension_configs=extension_configs - ) - - html = re.sub(r'

\s*(]+>)\s*

', r'\1', html, flags=re.IGNORECASE) - # Convert image followed by blockquote to figure with caption - html = re.sub( - r']+)>\s*
\s*

(.*?)

\s*
', - r'
\n \n
\2
\n
', - html, - flags=re.DOTALL - ) - - # Add "link" class and target="_blank" to all tags - html = re.sub(r'', r'', html) - html = re.sub(r'', r'', html) - html = re.sub(r'', r'', html) - - # Split the HTML into paragraphs - paragraphs = html.split('\n\n') - - # Clean up and ensure each paragraph is properly wrapped - cleaned_paragraphs = [] - - # Add the bold title as the first element if it exists - if has_h1_title and bold_title: - cleaned_paragraphs.append(bold_title) - - for p in paragraphs: - p = p.strip() - if p: - # If the paragraph doesn't already have

tags, add them - if not (p.startswith('<') and not p.startswith('

')): - p = f'

{p}

' - cleaned_paragraphs.append(p) - - return cleaned_paragraphs, title_line - - -def insert_markdown_into_template(template_path: str, markdown_text: str) -> str: - """ - Insert parsed markdown content into the template HTML file. - - Args: - template_path (str): Path to the template HTML file - markdown_text (str): The markdown text to convert and insert - - Returns: - str: Complete HTML with markdown content inserted - """ - # Parse markdown into HTML paragraphs - html_paragraphs, title_line = markdown_to_html_paragraphs(markdown_text) - - # Read the template - with open(template_path, 'r') as f: - template = f.read() - - # Join paragraphs into a single string - content_html = '\n'.join(html_paragraphs) - - # Insert the content into the template - complete_html = template.replace('{{ content }}', content_html) - - # Replace {{ title }} placeholders with the extracted title - complete_html = complete_html.replace('{{ title }}', title_line) - - return complete_html - - -def process_all_markdown_files(): - """ - Process all markdown files in blog/md/ directory and generate HTML files in blog/html/. - """ - # Get all markdown files in blog/md/ - md_files = glob.glob("dist/blog/md/*.md") - template_path = "dist/blog/template.html" - - for md_file in md_files: - # Extract base filename without extension - base_name = os.path.basename(md_file)[:-3] # Remove .md extension - html_file = f"dist/blog/html/{base_name}.html" - - print(f"Processing {md_file} -> {html_file}") - - try: - # Read the markdown content - with open(md_file, "r") as f: - markdown_text = f.read() - - # Generate HTML content - complete_html = insert_markdown_into_template(template_path, markdown_text) - - # Write HTML output - with open(html_file, "w") as f: - f.write(complete_html) - - except Exception as e: - print(f"Error processing {md_file}: {str(e)}") - - -if __name__ == "__main__": - process_all_markdown_files() \ No newline at end of file diff --git a/shell.nix b/shell.nix index 2f11621..3d5958e 100644 --- a/shell.nix +++ b/shell.nix @@ -13,16 +13,15 @@ pkgs.mkShell { in '' export PIP_REQUIRE_VIRTUALENV=1 export VENV_PATH=${venvPath} - + if [ ! -d $VENV_PATH ]; then python -m venv $VENV_PATH fi source $VENV_PATH/bin/activate pip install -r requirements.txt - python parser/md.py python generate.py - + ${if isDev then '' pip install watchdog==6.0.0 python watch.py && exit diff --git a/templates/base.html b/templates/base.html index cbb67ad..61d50a2 100644 --- a/templates/base.html +++ b/templates/base.html @@ -54,7 +54,7 @@ | Designed and implemented by Yan Lin. | -
Source Code + Source Code

diff --git a/templates/blog.html b/templates/blog.html deleted file mode 100644 index 954c14e..0000000 --- a/templates/blog.html +++ /dev/null @@ -1,18 +0,0 @@ -{% extends 'base.html' %} - -{% block title %}Yan Lin's Blog{% endblock %} - -{% block header_title %}Yan Lin's Blog{% endblock %} - -{% block navigation %} -{% endblock %} - -{% block content %} -
-
- {% for blog in data.blogs %} - {% include 'partials/blog.html' %} - {% endfor %} -
-
-{% endblock %} \ No newline at end of file diff --git a/templates/index.html b/templates/index.html index 4bcc887..a37c950 100644 --- a/templates/index.html +++ b/templates/index.html @@ -1,113 +1,122 @@ -{% extends 'base.html' %} - -{% block title %}Yan Lin's Homepage{% endblock %} - -{% block navigation %} - {% include 'partials/navigation.html' %} -{% endblock %} - -{% block content %} -
+{% extends 'base.html' %} {% block title %}Yan Lin's Homepage{% endblock %} {% +block navigation %} {% include 'partials/navigation.html' %} {% endblock %} {% +block content %} +
-

Biography - Yan Lin

-

- I am currently a postdoctoral researcher in the Department of Computer Science at Aalborg University. - I received my PhD and Bachelor's degrees from Beijing Jiaotong University, China. - My research interests include spatiotemporal data mining, representation learning, and AI for science. -

+

Biography - Yan Lin

+

+ I am currently a postdoctoral researcher in the Department of + Computer Science at Aalborg University. I received my PhD and + Bachelor's degrees from Beijing Jiaotong University, China. My + research interests include spatiotemporal data mining, + representation learning, and AI for science. +

-
- Yan Lin +
+ Yan Lin
-
+
-
+
-

Publications

- View All +

+ Publications +

+ View All
-
- {% for pub in data.primaryPublications[:10] %} - {% with type='primary' %} - {% include 'partials/publication.html' %} - {% endwith %} - {% endfor %} -
-
-
- {% for pub in data.secondaryPublications[:10] %} - {% with type='secondary' %} - {% include 'partials/publication.html' %} - {% endwith %} - {% endfor %} -
+
+ {% for pub in data.primaryPublications[:10] %} {% with + type='primary' %} {% include 'partials/publication.html' %} {% + endwith %} {% endfor %} +
+
+
+ {% for pub in data.secondaryPublications[:10] %} {% with + type='secondary' %} {% include 'partials/publication.html' %} {% + endwith %} {% endfor %} +
- * Equal Contribution + * Equal Contribution
-
+
-
+
-

Projects

- View All +

+ Projects +

+ View All
-
- {% for project in data.primaryProjects[:3] %} - {% with type='primary' %} - {% include 'partials/project.html' %} - {% endwith %} - {% endfor %} -
-
-
- {% for project in data.secondaryProjects[:3] %} - {% with type='secondary' %} - {% include 'partials/project.html' %} - {% endwith %} - {% endfor %} -
+
+ {% for project in data.primaryProjects[:3] %} {% with type='primary' + %} {% include 'partials/project.html' %} {% endwith %} {% endfor %} +
+
+
+ {% for project in data.secondaryProjects[:3] %} {% with + type='secondary' %} {% include 'partials/project.html' %} {% endwith + %} {% endfor %} +
-
+
-
+
-

Presentations

- View All +

+ Presentations +

+ View All
- {% for presentation in data.presentations[:5] %} - {% include 'partials/presentation.html' %} - {% endfor %} + {% for presentation in data.presentations[:5] %} {% include + 'partials/presentation.html' %} {% endfor %}
-
+
-
+

Services

-
    - {% for service in data.services %} -
  • {{ service }}
  • - {% endfor %} -
+
    + {% for service in data.services %} +
  • {{ service }}
  • + {% endfor %} +
-
+
-
-
-

Blog

- View All -
-
- {% for blog in data.blogs[:3] %} - {% include 'partials/blog.html' %} - {% endfor %} -
-
-{% endblock %} - -{% block extra_js %} - {{ super() }} -{% endblock %} \ No newline at end of file +{% endblock %} {% block extra_js %} {{ super() }} {% endblock %} diff --git a/templates/partials/blog.html b/templates/partials/blog.html deleted file mode 100644 index 3a47283..0000000 --- a/templates/partials/blog.html +++ /dev/null @@ -1,6 +0,0 @@ -
- - {{ blog.title }} - {{ blog.badge }} -

{{ blog.tldr }}

-
\ No newline at end of file diff --git a/templates/partials/navigation.html b/templates/partials/navigation.html index e6ef010..074cd9e 100644 --- a/templates/partials/navigation.html +++ b/templates/partials/navigation.html @@ -1,19 +1,36 @@ \ No newline at end of file + + diff --git a/watch.py b/watch.py index 7f1764f..a828b69 100644 --- a/watch.py +++ b/watch.py @@ -13,7 +13,7 @@ class ChangeHandler(FileSystemEventHandler): if any(event.src_path.endswith(ext) for ext in ['.md', '.py', '.html', '.css', '.js', '.yaml']): print(f"File {event.src_path} has been modified") self.regenerate() - + def on_created(self, event): if event.is_directory: return @@ -22,10 +22,9 @@ class ChangeHandler(FileSystemEventHandler): if any(event.src_path.endswith(ext) for ext in ['.md', '.py', '.html', '.css', '.js', '.yaml']): print(f"File {event.src_path} has been created") self.regenerate() - + def regenerate(self): print("Regenerating content...") - subprocess.run(["python", "parser/md.py"]) subprocess.run(["python", "generate.py"]) print("Content regenerated") @@ -34,9 +33,9 @@ if __name__ == "__main__": observer = Observer() observer.schedule(event_handler, ".", recursive=True) observer.start() - + http_server = subprocess.Popen(["python", "-m", "http.server", "8000", "--directory", "dist"]) - + try: print("Watching for file changes... (Press Ctrl+C to stop)") while True: @@ -44,4 +43,4 @@ if __name__ == "__main__": except KeyboardInterrupt: observer.stop() http_server.terminate() - observer.join() \ No newline at end of file + observer.join()