This post examines how to align a transcribed MIDI file with its corresponding audio file using a beat-tracking tool (madmom). The code comes from the Compound Word Transformer paper.
We use madmom; its usage is documented at the following link.
https://madmom.readthedocs.io/en/v0.16/modules/features/downbeats.html
from madmom.features.downbeats import DBNDownBeatTrackingProcessor
from madmom.features.downbeats import RNNDownBeatProcessor
import numpy as np

def estimate_beat(path_audio):
    # RNN computes beat/downbeat activations, DBN decodes beat times and bar positions
    proc = DBNDownBeatTrackingProcessor(beats_per_bar=[3, 4], fps=100)
    act = RNNDownBeatProcessor()(path_audio)
    proc_res = proc(act)
    return proc_res
proc_res[:10]
array([[0.41, 1. ],
[0.96, 2. ],
[1.51, 3. ],
[2.04, 4. ],
[2.58, 1. ],
[3.11, 2. ],
[3.67, 3. ],
[4.22, 4. ],
[4.75, 1. ],
[5.27, 2. ]])
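Each row of the result is (beat time in seconds, beat position within the bar), and a position of 1 marks a downbeat. Since beats_per_bar=[3, 4] lets madmom choose between 3/4 and 4/4, the positions running 1–4 above show it has settled on four beats per bar here.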
# compute tempo
beats = np.array([0.0] + list(proc_res[:, 0]))
intervals = np.diff(beats)
bpms = 60 / intervals
tempo_info = list(zip(beats[:-1], bpms))
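A quick sanity check with the beats above: a 0.0 is prepended so the tempo is defined from the very start of the audio; the first interval is 0.41 s (≈146 BPM) and the next two are 0.55 s (≈109 BPM), so tempo_info begins roughly as [(0.0, 146.3), (0.41, 109.1), (0.96, 109.1), ...], pairing each beat time with the tempo of the interval that follows it.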
Because they use the tick-based MIDI parsing tool miditoolkit, they first need to convert ticks into absolute time. Tick values on their own are meaningless unless the tempo changes they depend on are properly aligned.
# get absolute timing of instruments
tick_to_time = midi_data.get_tick_to_time_mapping()
abs_instr = get_instruments_abs_timing(midi_data.instruments, tick_to_time)
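The helper get_instruments_abs_timing is not shown in this excerpt; a minimal sketch of what it presumably does, assuming tick_to_time is the tick-indexed array of times returned by miditoolkit:

import copy

def get_instruments_abs_timing(instruments, tick_to_time):
    # hypothetical sketch: look up each note's tick in the tick_to_time array
    abs_instrs = copy.deepcopy(instruments)
    for instr in abs_instrs:
        for note in instr.notes:
            note.start = tick_to_time[note.start]
            note.end = tick_to_time[note.end]
    return abs_instrs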
Here the tick resolution (ticks_per_beat) is 480, so each beat is subdivided into 480 ticks, and after resampling every tick has a corresponding absolute time.
# compute time to tick mapping
resample_timing = []
for i in range(len(beats) - 1):
    start_beat = beats[i]
    end_beat = beats[i + 1]
    resample_timing += interp_linear(start_beat, end_beat, ticks_per_beat)
interp_linear(beats[0], beats[1], 480)
[0.0, 0.0008541666666666666, 0.0017083333333333332, 0.0025624999999999997, 0.0034166666666666664, 0.004270833333333333, 0.005124999999999999, 0.0059791666666666665, 0.006833333333333333, 0.007687499999999999, 0.008541666666666666, 0.009395833333333332, 0.010249999999999999, 0.011104166666666665, ...]
len(resample_timing)
138720
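interp_linear is also not defined in the excerpt; judging from its output above, it returns ticks_per_beat evenly spaced time points from start_beat up to, but excluding, end_beat, one per tick (the end point becomes the first sample of the next beat). A minimal sketch consistent with that output:

def interp_linear(start, end, num):
    # num evenly spaced points in [start, end), one per tick
    step = (end - start) / num
    return [start + i * step for i in range(num)]

The total length 138720 = 289 × 480 then means the recording contains 289 beat intervals, and resample_timing[t] is the absolute time of tick t.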
import copy

def convert_instruments_timing_from_abs_to_sym(instruments, time_to_tick):
    proc_instrs = copy.deepcopy(instruments)
    for instr in proc_instrs:
        for note in instr.notes:
            # snap each note boundary to the nearest tick on the resampled grid
            note.start = find_nearest_np(time_to_tick, note.start)
            note.end = find_nearest_np(time_to_tick, note.end)
    return proc_instrs
# convert abs to sym
sym_instr = convert_instruments_timing_from_abs_to_sym(abs_instr, resample_timing)
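find_nearest_np is likewise undefined here; it apparently returns the index of the array element closest to a given value, which converts an absolute time in seconds back into a tick index on the resampled grid. A minimal numpy sketch:

import numpy as np

def find_nearest_np(array, value):
    # index of the element of `array` closest to `value`
    return int(np.argmin(np.abs(np.asarray(array) - value)))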
In CP, they only consider the 4/4 time signature.
from miditoolkit.midi.containers import TimeSignature, TempoChange

# time signature
first_db_sec = find_first_downbeat(proc_res)
first_db_tick = find_nearest_np(resample_timing, first_db_sec)
time_signature_changes = [TimeSignature(numerator=4, denominator=4, time=int(first_db_tick))]
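find_first_downbeat is also not part of the excerpt; it presumably scans the beat-tracking result for the first row whose bar position is 1 and returns its time in seconds. A minimal sketch:

def find_first_downbeat(proc_res):
    # time of the first beat whose bar position (second column) equals 1
    for time, beat_pos in proc_res:
        if int(beat_pos) == 1:
            return time
    return proc_res[0][0]  # fallback if no downbeat was detected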
# tempo
tempo_changes = []
for pos, bpm in tempo_info:
    pos_tick = find_nearest_np(resample_timing, pos)
    tempo_changes.append(TempoChange(tempo=float(bpm), time=int(pos_tick)))
Since the detected first downbeat can fall at an arbitrary position, we shift everything so that it lands on a bar boundary. Note that the shift is anchored to the first downbeat, not to the first note of the tune, so an intentional pickup (incomplete opening bar) is preserved.
# shift (pickup at the beginning)
shift_align = ticks_per_beat * 4 - first_db_tick

# apply shift to tempo
for msg in tempo_changes:
    msg.time += shift_align

# apply shift to notes
for instr in sym_instr:
    for note in instr.notes:
        note.start += shift_align
        note.end += shift_align
shift_align
1440
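Since ticks_per_beat is 480, a full 4/4 bar spans 1920 ticks; shift_align = 1440 therefore implies first_db_tick = 480, i.e. the detected first downbeat falls one beat into the piece. After the shift, that downbeat lands exactly on the start of the second bar, and the single beat before it sits at the end of the first bar as a pickup.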