
Discussion: using nonlinearity and normalization in the operations #187

Open
@jakubMitura14

Description


Hello, I am using tensor contraction as a layer in Lux.jl, and it works without problems. Here is the implementation, in case it is useful for somebody.


using Lux, Random, CUDA, cuTENSOR, TensorOperations, VectorInterface
using NNlib: swish

# Layer wrapping a single tensor contraction; the contraction is passed in as an expression.
struct TensorOpLayer_str <: Lux.AbstractExplicitLayer
    param_shape
    operation_expression::Expr
end

# The trainable parameter is a dense Float32 tensor P of the requested shape.
function Lux.initialparameters(rng::AbstractRNG, l::TensorOpLayer_str)
    P = rand(rng, Float32, l.param_shape...)
    return (P=P,)
end

# Find every substring of the form "((...))" in a stringified expression -
# these literals hold the index tuples generated by the TensorOperations parser.
function extract_double_brackets(s::String)
    pattern = r"\(\(.*?\)\)"
    matches = eachmatch(pattern, s)
    return [m.match for m in matches]
end

# Turn a string such as "((1, 2), (3,))" back into the corresponding tuple.
function parse_tuple_from_string(s)
    # Parse the string into an expression
    expr = Meta.parse(s)

    # Evaluate the expression to get the tuple
    result = eval(expr)

    return result
end
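For reference, here is roughly what the two helpers do on a made-up string (the numbers are placeholders, not the real index tuples):

s = "f(C, ((1, 2), (3, 5)), A, ((4,), (1, 2, 3)))"        # made-up example
extract_double_brackets(s)                                 # -> ["((1, 2), (3, 5))", "((4,), (1, 2, 3))"]
parse_tuple_from_string("((1, 2), (3, 5))")                # -> ((1, 2), (3, 5))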


# Reverse-engineer the index tuples from the code that the TensorOperations
# parser would generate for this expression: PA and PB describe how the indices
# of the input and of the parameter P are split into open and contracted ones,
# and PC gives the index order of the result.
function Lux.initialstates(::AbstractRNG, l::TensorOpLayer_str)::NamedTuple
    ex = l.operation_expression
    parser = TensorOperations.tensorparser(ex, :allocator => TensorOperations.CUDAAllocator(), :backend => TensorOperations.cuTENSORBackend())
    parsed_ex = parser(ex)
    # pull the "((...))" index-tuple literals out of the stringified parser output
    arg_tuples = extract_double_brackets(string(parsed_ex.args[3]))
    arg_tuples = map(parse_tuple_from_string, arg_tuples)

    return (PA=arg_tuples[1], PB=arg_tuples[2], PC=arg_tuples[3])
end


# Forward pass: allocate the output buffer on the GPU, run the contraction with
# the cuTENSOR backend, then apply the nonlinearity.
function (l::TensorOpLayer_str)(x, ps, st::NamedTuple)

    res = TensorOperations.tensoralloc_contract(Float32, x, st.PA, false, ps.P, st.PB, false, st.PC, Val{false}(), TensorOperations.CUDAAllocator{CUDA.UnifiedMemory, CUDA.DeviceMemory, CUDA.DeviceMemory}())
    res = TensorOperations.tensorcontract!(res, x, st.PA, false, ps.P, st.PB, false, st.PC, VectorInterface.One(), VectorInterface.Zero(), TensorOperations.cuTENSORBackend(), TensorOperations.CUDAAllocator{CUDA.UnifiedMemory, CUDA.DeviceMemory, CUDA.DeviceMemory}())
    res = swish.(res)

    return res, st
end
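For completeness, a single layer can be set up and called on its own roughly like this (the shapes below are made up purely for illustration, and moving the parameter to the GPU could also be done with Lux's device utilities):

rng = Random.default_rng()
layer = TensorOpLayer_str((4, 8, 3), :(res[b, f, n, e] := x[b, c, f, n, d] * P[d, e, c]))
ps, st = Lux.setup(rng, layer)                 # P has shape (d, e, c) = (4, 8, 3)
ps = (P = CUDA.cu(ps.P),)                      # the parameter tensor has to live on the GPU
x = CUDA.rand(Float32, 2, 3, 5, 6, 4)          # (b, c, f, n, d)
y, st = layer(x, ps, st)                       # y has shape (b, f, n, e) = (2, 5, 6, 8)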

Maybe it is not perfect, since it does some not-so-pretty reverse engineering (I was unable to get the generated return statement out of the tensor macro when the expression is passed into the layer state), but it works.

Then I invoke the layers like this:

            , TensorOpLayer_str((num_directions, primary_sv_repr, num_channels), :(res[b, f, n, e] := x[b, c, f, n, d] * P[d, e, c]))
            , GroupNorm(num_params_exec, num_params_exec, affine=true)
            , TensorOpLayer_str((num_params_exec, primary_sv_repr, final_sv_repr), :(res[b, f, e] := x[b, f, n, d] * P[n, d, e]))
            , LayerNorm((batch_size, flat_sv_len))
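Put together, that fragment sits inside a chain roughly like this (the size constants here are placeholders for the real ones in my network):

num_channels, num_directions, primary_sv_repr = 4, 3, 8     # placeholder sizes
num_params_exec, final_sv_repr = 8, 16
batch_size, flat_sv_len = 2, 128

model = Lux.Chain(
    TensorOpLayer_str((num_directions, primary_sv_repr, num_channels), :(res[b, f, n, e] := x[b, c, f, n, d] * P[d, e, c])),
    GroupNorm(num_params_exec, num_params_exec, affine=true),
    TensorOpLayer_str((num_params_exec, primary_sv_repr, final_sv_repr), :(res[b, f, e] := x[b, f, n, d] * P[n, d, e])),
    LayerNorm((batch_size, flat_sv_len)),
)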

This is only part of a network. I divided the operation into multiple smaller operations so that I can apply a nonlinearity (here swish) and normalization between them (the numbers grow fast and lead to instabilities in training without some kind of normalization).

However, and here is the problem: I know the operation would be far more performant if I combined operations like

res[b, f, n, e] := x[b, c, f, n, d] * P[d, e, c]
res[b, f, e] := x[b, f, n, d] * P[n, d, e]

etc. into one operation. However, that way I would be unable to add nonlinearities and normalization between each step. Is there a better way to achieve this than what I did?
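To make the trade-off concrete, the fused version I have in mind would look something like this (index names chosen just for illustration, with P1 and P2 the two parameter tensors from above), and there is no place left to put swish or a norm between the two contractions:

# single contraction instead of two: x[b,c,f,n,d] * P1[d,e1,c] gives the
# intermediate [b,f,n,e1], which is then contracted with P2[n,e1,e2]
@tensor res[b, f, e2] := x[b, c, f, n, d] * P1[d, e1, c] * P2[n, e1, e2]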

Thanks for the help!
