In [1]:
# import audio
import librosa
import IPython.display as ipd
from IPython.display import YouTubeVideo
import numpy as np
from IPython.core.display import HTML
In [2]:
# Display the value of every top-level expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
setattr(InteractiveShell, "ast_node_interactivity", "all")
In [3]:
# show title here

GENERATING STRUCTURED DRUM PATTERN

USING VARIATIONAL AUTOENCODER AND SELF-SIMILARITY MATRIX

Example Audio and Code


I-Chieh Wei1 ,   Chih-Wei Wu2 ,   Li Su1

1Institute of Information Science, Academia Sinica, Taiwan
2Netflix, Inc., USA
sma1033@iis.sinica.edu.tw, chihweiw@netflix.com, lisu@iis.sinica.edu.tw

Generating drum patterns with structural information

Drum pattern generation is a task that focuses on the rhythmic aspect of music and aims at generating percussive sequences. However, one of the main challenges is to generate structurally cohesive sequences. In this study, we present a drum pattern generation model based on Variational Autoencoders (VAEs) that utilizes a self-similarity matrix (SSM) to encapsulate structural information.

To test the effectiveness of the proposed model, we compare the drum patterns from four generation methods:
     1. OMD - Original MIDI Drums.
     2. ODS - Original Drum SSM.
     3. PDS - Predicted Drum SSM.
     4. NB - Neighboring Bars.

If you are interested in the model details, please refer to our paper.

Audio samples from OMD, ODS, PDS, NB

In [4]:
%%html
<!-- Listening grid: ten rows (samples 01-10), each with four audio players in the
     order OMD, ODS, PDS, NB. The <pre> line above each row holds the column
     captions; its spacing is hand-tuned to sit over the 240px-wide players. -->
<!-- NOTE(review): <pre> is not valid inside <p>; per the HTML parsing rules the
     opening <p> is implicitly closed before the first <pre>. Confirm no styling
     relies on the "sample" class wrapping the players before changing this. -->
<p class="sample">
    &nbsp;
    <pre><b>         OMD Sample 1                    ODS Sample 1                     PDS Sample 1                   NB Sample 1</b></pre>
    <audio src="./audio/sample_01_omd.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_01_ods.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_01_pds.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_01_nb.wav"  controls="" style="width: 240px;"></audio>
    <br>
    <br>
    <pre><b>         OMD Sample 2                    ODS Sample 2                     PDS Sample 2                   NB Sample 2</b></pre>
    <audio src="./audio/sample_02_omd.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_02_ods.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_02_pds.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_02_nb.wav"  controls="" style="width: 240px;"></audio>
    <br>
    <br>
    <pre><b>         OMD Sample 3                    ODS Sample 3                     PDS Sample 3                   NB Sample 3</b></pre>
    <audio src="./audio/sample_03_omd.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_03_ods.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_03_pds.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_03_nb.wav"  controls="" style="width: 240px;"></audio>
    <br>
    <br>
    <pre><b>         OMD Sample 4                    ODS Sample 4                     PDS Sample 4                   NB Sample 4</b></pre>
    <audio src="./audio/sample_04_omd.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_04_ods.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_04_pds.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_04_nb.wav"  controls="" style="width: 240px;"></audio>
    <br>
    <br>
    <pre><b>         OMD Sample 5                    ODS Sample 5                     PDS Sample 5                   NB Sample 5</b></pre>
    <audio src="./audio/sample_05_omd.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_05_ods.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_05_pds.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_05_nb.wav"  controls="" style="width: 240px;"></audio>
    <br>
    <br>
    <pre><b>         OMD Sample 6                    ODS Sample 6                     PDS Sample 6                   NB Sample 6</b></pre>
    <audio src="./audio/sample_06_omd.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_06_ods.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_06_pds.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_06_nb.wav"  controls="" style="width: 240px;"></audio>
    <br>
    <br>
    <pre><b>         OMD Sample 7                    ODS Sample 7                     PDS Sample 7                   NB Sample 7</b></pre>
    <audio src="./audio/sample_07_omd.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_07_ods.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_07_pds.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_07_nb.wav"  controls="" style="width: 240px;"></audio>
    <br>
    <br>
    <pre><b>         OMD Sample 8                    ODS Sample 8                     PDS Sample 8                   NB Sample 8</b></pre>
    <audio src="./audio/sample_08_omd.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_08_ods.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_08_pds.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_08_nb.wav"  controls="" style="width: 240px;"></audio>
    <br>
    <br>
    <pre><b>         OMD Sample 9                    ODS Sample 9                     PDS Sample 9                   NB Sample 9</b></pre>
    <audio src="./audio/sample_09_omd.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_09_ods.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_09_pds.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_09_nb.wav"  controls="" style="width: 240px;"></audio>
    <br>
    <br>
    <pre><b>         OMD Sample 10                   ODS Sample 10                    PDS Sample 10                  NB Sample 10</b></pre>
    <audio src="./audio/sample_10_omd.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_10_ods.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_10_pds.wav" controls="" style="width: 240px;"></audio>
    <audio src="./audio/sample_10_nb.wav"  controls="" style="width: 240px;"></audio>
</p>

 

         OMD Sample 1                    ODS Sample 1                     PDS Sample 1                   NB Sample 1


         OMD Sample 2                    ODS Sample 2                     PDS Sample 2                   NB Sample 2


         OMD Sample 3                    ODS Sample 3                     PDS Sample 3                   NB Sample 3


         OMD Sample 4                    ODS Sample 4                     PDS Sample 4                   NB Sample 4


         OMD Sample 5                    ODS Sample 5                     PDS Sample 5                   NB Sample 5


         OMD Sample 6                    ODS Sample 6                     PDS Sample 6                   NB Sample 6


         OMD Sample 7                    ODS Sample 7                     PDS Sample 7                   NB Sample 7


         OMD Sample 8                    ODS Sample 8                     PDS Sample 8                   NB Sample 8


         OMD Sample 9                    ODS Sample 9                     PDS Sample 9                   NB Sample 9


         OMD Sample 10                   ODS Sample 10                    PDS Sample 10                  NB Sample 10


Controllable rhythm complexity

As described in Section 3.4, we use $\hat c$ in the drum VAE latent space to represent the note density (rhythm complexity) of the VAE model output. Adding a number to $\hat c$ in the latent space changes the output note count and therefore makes the generation result more diverse. In the following examples, we demonstrate the effect of replacing $\hat c$ with $\hat c + \alpha$ ($\alpha \in \{6, 20\}$) on two different songs. Drum tracks are shown for detailed visual inspection.

Example 1 Drag Racing

In [5]:
%%html
<!-- Rhythm-complexity comparison for clip 00244: four players captioned
     (1) OMD, (2) PDS, (3) PDS(+6), (4) PDS(+20); the last two load the
     *_pds_p06n.wav and *_pds_p20n.wav files from ./adj_chat_sample/. -->
<p class="sample">
    &nbsp;
    <pre><b>            (1) OMD                        (2) PDS                        (3) PDS(+6)                     (4)PDS(+20)</b></pre>
    <audio src="./adj_chat_sample/audio_00244_omd.wav" controls="" style="width: 240px;"></audio>
    <audio src="./adj_chat_sample/audio_00244_pds.wav" controls="" style="width: 240px;"></audio>
    <audio src="./adj_chat_sample/audio_00244_pds_p06n.wav" controls="" style="width: 240px;"></audio>
    <audio src="./adj_chat_sample/audio_00244_pds_p20n.wav"  controls="" style="width: 240px;"></audio>
</p>

 

            (1) OMD                        (2) PDS                        (3) PDS(+6)                     (4)PDS(+20)

Example 2 Drag Racing

In [6]:
%%html
<!-- Rhythm-complexity comparison for clip 07942: four players captioned
     (1) OMD, (2) PDS, (3) PDS(+6), (4) PDS(+20); the last two load the
     *_pds_p06n.wav and *_pds_p20n.wav files from ./adj_chat_sample/. -->
<p class="sample">
    &nbsp;
    <pre><b>            (1) OMD                        (2) PDS                        (3) PDS(+6)                     (4)PDS(+20)</b></pre>
    <audio src="./adj_chat_sample/audio_07942_omd.wav" controls="" style="width: 240px;"></audio>
    <audio src="./adj_chat_sample/audio_07942_pds.wav" controls="" style="width: 240px;"></audio>
    <audio src="./adj_chat_sample/audio_07942_pds_p06n.wav" controls="" style="width: 240px;"></audio>
    <audio src="./adj_chat_sample/audio_07942_pds_p20n.wav"  controls="" style="width: 240px;"></audio>
</p>

 

            (1) OMD                        (2) PDS                        (3) PDS(+6)                     (4)PDS(+20)


Generating drum track for real-world audio

One of the goals of this project is to generate an accompanying drum track that supports the rhythmic part of a song for a solo player. Therefore, we test the capability of our model on random songs downloaded from YouTube. The experimental results suggest that our model still has difficulty generating consistent drum patterns within a musical section when applied to real-world audio data. We believe this is caused by the large difference between the real-world audio SSM and the symbolic one. How to narrow the gap between the two remains a challenging problem and needs more exploration.

In [7]:
%%html
<!-- Two 270px players for the real-world audio samples (sample_rs07 / sample_rs08),
     followed after a <br> by one "source audio Youtube link" anchor per sample,
     each opening in a new tab. The &nbsp; runs are manual horizontal spacing,
     presumably to place each link under its player - verify in the rendered page. -->
<p class="sample">
    &nbsp;
    <pre><b>      real-world audio sample 1           real-world audio sample 2 </b></pre>
    <audio src="./audio/sample_rs07.wav" controls="" style="width: 270px;"></audio>
    <audio src="./audio/sample_rs08.wav" controls="" style="width: 270px;"></audio><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
    <b><em><a href="https://www.youtube.com/watch?v=xIuryqXx7fY" target="_blank">source audio Youtube link</a></em></b>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
    <b><em><a href="https://www.youtube.com/watch?v=SrnCvD2OHlQ" target="_blank">source audio Youtube link</a></em></b>
</p>

 

      real-world audio sample 1           real-world audio sample 2 

           source audio Youtube link                          source audio Youtube link

In [8]:
#<font size="1"> This is my text number 1</font><br>
#<font size="2"> This is my text number 2</font><br>
#<font size="3"> This is my text number 3</font><br>
#<font size="4"> This is my text number 4</font><br>
#<font size="5"> This is my text number 5</font><br>
#<font size="6"> This is my text number 6</font>
In [9]:
#wav_file = "./testing_10_songs/test_wavs_cutted/test_song_01.wav"
#audio_data, samp_rate = librosa.load(wav_file, duration=10, mono=True, sr=44100)

#print ("Audio time: {}".format(audio_data.shape[0]/samp_rate))
#print ("Audio data: {}".format(audio_data.shape[0]))
#print ("Sample rate: {}".format(samp_rate))

## calculate CQT data
#audio_cqt = librosa.cqt(audio_data, sr=samp_rate, hop_length=512, n_bins=84, bins_per_octave=12)
#audio_mag, audio_phase = librosa.magphase(audio_cqt)
#audio_spec = np.transpose((librosa.amplitude_to_db(audio_mag, ref=np.max) * 1.25) + 100)
#cqt_fps = audio_spec.shape[0]/(audio_data.shape[0]/samp_rate)

#print ("CQT Frame data shape: {}".format(audio_spec.shape))
#print ("CQT FPS: {:.2f}".format(cqt_fps))
In [10]:
#ipd.Audio(audio_data[:44100*5], rate=44100)
#ipd.Audio(audio_data[:44100*8], rate=44100)
In [11]:
#my_video = YouTubeVideo('FQw2CX6bn7Q')
In [12]:
#display(my_video)
In [13]:
# convert nb file into html
In [14]:
# Shell out to nbconvert to render drum_vae_with_ssm_html.ipynb as HTML.
# NOTE(review): presumably this is the notebook itself; save it first so the export is current.
!jupyter nbconvert --to html drum_vae_with_ssm_html.ipynb

In [15]:
# Run the helper script inside the current interactive namespace (-i).
# NOTE(review): judging by its name it strips input cells from the exported HTML - confirm against the script.
%run -i 'remove_nb_input_cell.py'